diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 66c1dfc71c2f5..13fc92f64e8b1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17501,26 +17501,18 @@ Align SITargetLowering::computeKnownAlignForTargetInstr( Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML); const Align CacheLineAlign = Align(64); - - // Pre-GFX10 target did not benefit from loop alignment - if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() || - getSubtarget()->hasInstFwdPrefetchBug()) + if (!ML || DisableLoopAlignment) return PrefAlign; - - // On GFX10 I$ is 4 x 64 bytes cache lines. - // By default prefetcher keeps one cache line behind and reads two ahead. - // We can modify it with S_INST_PREFETCH for larger loops to have two lines - // behind and one ahead. - // Therefor we can benefit from aligning loop headers if loop fits 192 bytes. - // If loop fits 64 bytes it always spans no more than two cache lines and - // does not need an alignment. - // Else if loop is less or equal 128 bytes we do not need to modify prefetch, - // Else if loop is less or equal 192 bytes we need two lines behind. - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); const MachineBasicBlock *Header = ML->getHeader(); if (Header->getAlignment() != PrefAlign) return Header->getAlignment(); // Already processed. 
+ const MachineFunction *MF = Header->getParent(); + const Function &Fn = MF->getFunction(); + for (auto &BB : Fn) + for (auto &I : BB) + if (isa(&I)) + return PrefAlign; unsigned LoopSize = 0; for (const MachineBasicBlock *MBB : ML->blocks()) { @@ -17531,13 +17523,41 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { for (const MachineInstr &MI : *MBB) { LoopSize += TII->getInstSizeInBytes(MI); - if (LoopSize > 192) - return PrefAlign; } + if (LoopSize > 192) + break; + } + + if (!getSubtarget()->hasInstPrefetch() || + getSubtarget()->hasInstFwdPrefetchBug()) { + // Align loops <= 32 bytes aggressively + if (LoopSize <= 32) + return Align(32); + // Align larger loops less aggressively + if (!ML->isInnermost()) + return PrefAlign; + return Align(16); + } + + // On GFX10 I$ is 4 x 64 bytes cache lines. + // By default prefetcher keeps one cache line behind and reads two ahead. + // We can modify it with S_INST_PREFETCH for larger loops to have two lines + // behind and one ahead. + // Therefore we can benefit from aligning loop headers if loop fits 192 bytes. + // If loop fits 64 bytes it always spans no more than two cache lines and + // does not need an alignment driven by prefetch considerations. + // Else if loop is less or equal 128 bytes we do not need to modify prefetch, + // Else if loop is less or equal 192 bytes we need two lines behind.
+ + // Align larger loops less aggressively + if (LoopSize > 192) { + if (!ML->isInnermost()) + return PrefAlign; + return Align(16); } if (LoopSize <= 64) - return PrefAlign; + return Align(32); if (LoopSize <= 128) return CacheLineAlign; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 666523c88860c..969a86c9810bc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -331,6 +331,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -376,6 +377,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -400,6 +402,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -424,6 +427,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) @@ -478,6 +482,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -522,6 +527,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -545,6 +551,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -568,6 +575,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -610,6 +618,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -648,6 +657,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; 
GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -694,6 +704,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -720,6 +731,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -767,6 +779,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -804,6 +817,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -849,6 +863,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], 
v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -873,6 +888,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -926,6 +942,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -971,6 +988,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -995,6 +1013,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1019,6 +1038,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1069,6 +1089,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1115,6 +1136,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1138,6 +1160,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1161,6 +1184,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1200,6 +1224,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 
@@ -1238,6 +1263,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1284,6 +1310,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1313,6 +1340,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1355,6 +1383,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1392,6 +1421,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1438,6 +1468,7 @@ define void 
@flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1465,6 +1496,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1518,6 +1550,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX942-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1567,6 +1600,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1593,6 +1627,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1620,6 +1655,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1674,6 +1710,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1722,6 +1759,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1747,6 +1785,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1773,6 +1812,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1817,6 +1857,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; 
GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1859,6 +1900,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1912,6 +1954,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1943,6 +1986,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1989,6 +2033,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2029,6 +2074,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: 
v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2079,6 +2125,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2108,6 +2155,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 351502816ae6e..8dbf77fdb8e32 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -331,6 +331,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -376,6 +377,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: 
.LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -400,6 +402,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -424,6 +427,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -478,6 +482,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -522,6 +527,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -545,6 +551,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -568,6 +575,7 
@@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -610,6 +618,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -648,6 +657,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -694,6 +704,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -720,6 +731,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -767,6 +779,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_load_b64 
v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -804,6 +817,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -849,6 +863,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -873,6 +888,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -926,6 +942,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -971,6 +988,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; 
GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -995,6 +1013,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1019,6 +1038,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1069,6 +1089,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1115,6 +1136,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1138,6 +1160,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1161,6 +1184,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1200,6 +1224,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1238,6 +1263,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1284,6 +1310,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1313,6 +1340,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1355,6 
+1383,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1392,6 +1421,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1438,6 +1468,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1465,6 +1496,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1518,6 +1550,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX942-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1567,6 +1600,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1593,6 +1627,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1620,6 +1655,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1674,6 +1710,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1722,6 +1759,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1747,6 +1785,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1773,6 +1812,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1817,6 +1857,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1859,6 +1900,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1912,6 +1954,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1943,6 +1986,7 @@ define double 
@buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1989,6 +2033,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2029,6 +2074,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2079,6 +2125,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2108,6 +2155,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index 5dff8c16f7c89..d6fa27769b357 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -105,6 +105,7 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) { ; GFX10-NEXT: s_mov_b32 s5, 1 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: ; implicit-def: $sgpr7 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB2_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6 @@ -154,6 +155,7 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: s_branch .LBB3_2 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB3_1: ; %loop_body ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s6 @@ -247,6 +249,7 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_4 ; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader ; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB4_2: ; %.preheader ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v3, s12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index dd01112d97a18..986b4f9221b72 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -17,6 +17,7 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 
s6, 0 ; GFX10-NEXT: ; implicit-def: $sgpr7 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6 @@ -66,6 +67,7 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr ; GFX10-NEXT: s_or_b32 s7, s5, s6 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB1_2 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB1_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 @@ -137,6 +139,7 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: ; implicit-def: $sgpr7 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB2_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6 @@ -183,6 +186,7 @@ define void @divergent_i1_xor_used_outside_loop_twice(float %val, float %pre.con ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s7, 0 ; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB3_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s7 @@ -249,6 +253,7 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-NEXT: ; implicit-def: $sgpr11 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: s_branch .LBB4_3 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB4_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 @@ -349,6 +354,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: s_branch .LBB5_2 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB5_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 @@ -462,6 +468,7 @@ define amdgpu_ps void 
@divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-NEXT: ; implicit-def: $sgpr4 ; GFX10-NEXT: ; implicit-def: $sgpr3 ; GFX10-NEXT: s_branch .LBB6_2 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB6_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 @@ -540,6 +547,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB7_2 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB7_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index fd08ab88990ed..8fffa7749dd44 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -110,6 +110,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB2_2 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB2_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -184,6 +185,7 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB3_3 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB3_1: ; %Flow3 ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -282,6 +284,7 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB4_4 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB4_1: ; %Flow5 ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ 
-410,6 +413,7 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB5_2 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB5_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -572,6 +576,7 @@ define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; GFX10-NEXT: v_cmp_le_i32_e64 s0, v4, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB6_6: ; %.inner_loop ; GFX10-NEXT: ; Parent Loop BB6_2 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index d13d6a19d332a..8b619231e3dff 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -9,6 +9,7 @@ define void @temporal_divergent_i1_phi(float %val, ptr %addr) { ; GFX10-NEXT: s_mov_b32 s5, 1 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: ; implicit-def: $sgpr7 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6 @@ -56,6 +57,7 @@ define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) { ; GFX10-NEXT: s_mov_b32 s5, 1 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: ; implicit-def: $sgpr7 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB1_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6 @@ -106,6 +108,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: ; implicit-def: $sgpr10 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: s_branch .LBB2_3 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB2_1: ; %loop.body ; 
GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 @@ -214,6 +217,7 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n. ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo ; GFX10-NEXT: flat_load_dword v0, v[6:7] +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB3_2: ; %InnerHeader ; GFX10-NEXT: ; Parent Loop BB3_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 @@ -305,6 +309,7 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n. ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo ; GFX10-NEXT: flat_load_dword v0, v[6:7] +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB4_2: ; %InnerHeader ; GFX10-NEXT: ; Parent Loop BB4_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 @@ -396,6 +401,7 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo ; GFX10-NEXT: flat_load_dword v0, v[8:9] +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB5_2: ; %InnerHeader ; GFX10-NEXT: ; Parent Loop BB5_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll index d4e5487828c48..ce97935ae7afc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll @@ -7,6 +7,7 @@ define void @temporal_divergent_i32(float %val, ptr %addr) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, -1 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_add_i32 s5, s5, 1 @@ -42,6 +43,7 
@@ define void @temporal_divergent_i32_multiple_use(float %val, ptr %addr, ptr %add ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, -1 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB1_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_add_i32 s5, s5, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 6148bc2d5ae6e..5c1d06043b962 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -205,6 +205,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-NEXT: v_subrev_u32_e32 v0, s0, v0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_branch .LBB5_2 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB5_1: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll index e3b92508c2997..f613f7b0ce7d3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll @@ -38,7 +38,7 @@ define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) { ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, s1 ; CHECK-NEXT: s_branch .LBB0_4 -; CHECK-NEXT: .p2align 6 +; CHECK-NOT: .p2align 6 ; CHECK-NEXT: .LBB0_3: ; %bb6 ; CHECK-NEXT: ; in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll index 8a53c862371cf..e64032926f1b3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -901,6 +901,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x 
float> %input, float %arg, i32 %index ; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_branch .LBB7_5 +; SI-NEXT: .p2align ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -967,6 +968,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_branch .LBB7_5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1032,6 +1034,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-32-NEXT: s_branch .LBB7_5 +; GFX10-32-NEXT: .p2align ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 @@ -1096,6 +1099,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-64-NEXT: s_branch .LBB7_5 +; GFX10-64-NEXT: .p2align ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll index e0016b0a5a64d..aa6def29a3131 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll @@ -12,6 +12,7 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: s_mov_b32 s3, 0xf000 ; LOOP-NEXT: v_mov_b32_e32 v5, s1 ; LOOP-NEXT: v_mov_b32_e32 v4, s0 +; LOOP-NEXT: .p2align ; LOOP-NEXT: .LBB0_1: ; %load-store-loop ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 ; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll index 04652af147f9b..1f8879d311617 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll @@ -12,6 +12,7 @@ define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) { ; LOOP-NEXT: s_mov_b32 s3, 0xf000 ; LOOP-NEXT: v_mov_b32_e32 v4, s1 ; LOOP-NEXT: v_mov_b32_e32 v3, s0 +; LOOP-NEXT: .p2align ; LOOP-NEXT: .LBB0_1: ; %loadstoreloop ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 ; LOOP-NEXT: v_add_i32_e32 v5, vcc, v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll index 5240bf4f3a1d7..1bb0a787739f9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll @@ -433,6 +433,7 @@ define amdgpu_ps void @divergent_because_of_temporal_divergent_use(float %val, p ; OLD_RBS-NEXT: s_mov_b32 s0, -1 ; OLD_RBS-NEXT: v_mov_b32_e32 v3, s0 ; OLD_RBS-NEXT: s_mov_b32 s0, 0 +; OLD_RBS-NEXT: .p2align ; OLD_RBS-NEXT: .LBB15_1: ; %loop ; OLD_RBS-NEXT: ; =>This Inner Loop Header: Depth=1 ; OLD_RBS-NEXT: v_add_nc_u32_e32 v3, 1, v3 @@ -451,6 +452,7 @@ define amdgpu_ps void @divergent_because_of_temporal_divergent_use(float %val, p ; NEW_RBS: ; %bb.0: ; %entry ; NEW_RBS-NEXT: s_mov_b32 s1, -1 ; NEW_RBS-NEXT: s_mov_b32 s0, 0 +; NEW_RBS-NEXT: .p2align ; NEW_RBS-NEXT: .LBB15_1: ; %loop ; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1 ; NEW_RBS-NEXT: s_add_i32 s1, s1, 1 @@ -489,6 +491,7 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; OLD_RBS-NEXT: ; implicit-def: $sgpr1 ; OLD_RBS-NEXT: v_mov_b32_e32 v6, s0 ; OLD_RBS-NEXT: s_branch .LBB16_3 +; OLD_RBS-NEXT: .p2align ; OLD_RBS-NEXT: .LBB16_1: ; %Flow3 ; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 ; OLD_RBS-NEXT: s_waitcnt_depctr 0xffe3 @@ -551,6 +554,7 @@ 
define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: s_mov_b32 s0, 0 ; NEW_RBS-NEXT: ; implicit-def: $sgpr5 ; NEW_RBS-NEXT: s_branch .LBB16_3 +; NEW_RBS-NEXT: .p2align ; NEW_RBS-NEXT: .LBB16_1: ; %Flow3 ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 ; NEW_RBS-NEXT: s_waitcnt_depctr 0xffe3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index 9c2fabce4bcde..6851b98f95391 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -596,6 +596,7 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX906-NEXT: v_mov_b32_e32 v2, 24 +; GFX906-NEXT: .p2align ; GFX906-NEXT: .LBB10_1: ; %bb.1 ; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 3160e38df5e3f..ec0f1e6fef551 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -598,6 +598,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_mul_i32 s9, s6, s9 ; GFX908-NEXT: s_add_i32 s13, s13, s23 ; GFX908-NEXT: s_branch .LBB3_5 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -649,6 +650,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 ; 
GFX908-NEXT: s_branch .LBB3_4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: s_mov_b64 s[22:23], s[18:19] ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] @@ -759,6 +761,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_mul_i32 s9, s6, s9 ; GFX90A-NEXT: s_add_i32 s13, s13, s23 ; GFX90A-NEXT: s_branch .LBB3_5 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -803,6 +806,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] ; GFX90A-NEXT: s_branch .LBB3_4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: s_mov_b64 s[22:23], s[18:19] ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll index e03c9ca34b825..3d2ba32e532c0 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll @@ -14,6 +14,7 @@ define amdgpu_ps void @main(i32 %arg) { ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_branch .LBB0_2 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB0_1: ; in Loop: Header=BB0_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 394727c88b0be..9044227b0c9a9 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -565,6 +565,7 @@ define amdgpu_kernel void 
@add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -608,6 +609,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -649,6 +651,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -689,6 +692,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .p2align ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -729,6 +733,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .p2align ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 @@ -769,6 +774,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: .p2align ; 
GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -813,6 +819,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: s_mov_b32 s0, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: .p2align ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -854,6 +861,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: .p2align ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -899,6 +907,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: s_mov_b32 s0, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: .p2align ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -950,6 +959,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB3_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -995,6 +1005,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1038,6 +1049,7 @@ define amdgpu_kernel 
void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1080,6 +1092,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .p2align ; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1123,6 +1136,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .p2align ; GFX10W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 @@ -1166,6 +1180,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: .p2align ; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -1213,6 +1228,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: s_mov_b32 s0, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: .p2align ; GFX11W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -1256,6 +1272,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s2, 0 ; 
GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: .p2align ; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -1304,6 +1321,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: s_mov_b32 s0, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: .p2align ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -2026,6 +2044,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB7_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -2069,6 +2088,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -2110,6 +2130,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -2150,6 +2171,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .p2align ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, 
s[0:1] @@ -2190,6 +2212,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .p2align ; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 @@ -2230,6 +2253,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: .p2align ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -2274,6 +2298,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: s_mov_b32 s0, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: .p2align ; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -2316,6 +2341,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: .p2align ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -2361,6 +2387,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: s_mov_b32 s0, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: .p2align ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 0fccdba729132..299bbde344b3c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -666,6 +666,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -713,6 +714,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -758,6 +760,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -803,6 +806,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] @@ -849,6 +853,7 @@ define 
amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 @@ -895,6 +900,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] @@ -944,6 +950,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -991,6 +998,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1264_ITERATIVE-NEXT: .p2align ; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] @@ -1039,6 +1047,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1232_ITERATIVE-NEXT: 
.p2align ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -2296,6 +2305,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -2351,6 +2361,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -2404,6 +2415,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -2457,6 +2469,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -2510,6 +2523,7 @@ define amdgpu_kernel void 
@add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 @@ -2563,6 +2577,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -2620,6 +2635,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -2675,6 +2691,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1264_ITERATIVE-NEXT: .p2align ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] @@ -2729,6 +2746,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GFX1232_ITERATIVE-NEXT: .p2align ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -4201,6 +4219,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -4248,6 +4267,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -4293,6 +4313,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -4338,6 +4359,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] @@ -4384,6 +4406,7 @@ define amdgpu_kernel void 
@sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 @@ -4430,6 +4453,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] @@ -4479,6 +4503,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -4526,6 +4551,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1264_ITERATIVE-NEXT: .p2align ; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] @@ -4574,6 +4600,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1232_ITERATIVE-NEXT: .p2align ; 
GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -5862,6 +5889,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -5917,6 +5945,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -5970,6 +5999,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -6023,6 +6053,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] @@ -6076,6 +6107,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr 
addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 @@ -6129,6 +6161,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -6186,6 +6219,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -6241,6 +6275,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1264_ITERATIVE-NEXT: .p2align ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] @@ -6295,6 +6330,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; 
GFX1232_ITERATIVE-NEXT: .p2align ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -7651,6 +7687,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s15 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: .p2align ; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v1 @@ -7712,6 +7749,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v1 @@ -7771,6 +7809,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_u32_e32 v0, s14, v1 @@ -7830,6 +7869,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1 @@ -7887,6 +7927,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1 @@ -7946,6 +7987,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-TRUE16-NEXT: .p2align ; GFX1164-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8011,6 +8053,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-FAKE16-NEXT: .p2align ; GFX1164-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8075,6 +8118,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-TRUE16-NEXT: .p2align ; GFX1132-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8137,6 +8181,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-FAKE16-NEXT: .p2align ; GFX1132-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8201,6 +8246,7 @@ define 
amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1264-TRUE16-NEXT: .p2align ; GFX1264-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8267,6 +8313,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1264-FAKE16-NEXT: .p2align ; GFX1264-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8334,6 +8381,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-TRUE16-NEXT: .p2align ; GFX1232-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8400,6 +8448,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-FAKE16-NEXT: .p2align ; GFX1232-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8457,6 +8506,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: .p2align ; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v1 @@ -8498,6 +8548,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_and_b32_e32 v0, s9, v1 @@ -8538,6 +8589,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_and_b32_e32 v0, s9, v1 @@ -8579,6 +8631,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, s10 @@ -8619,6 +8672,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, s8 @@ -8659,6 +8713,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .p2align ; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start ; 
GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -8701,6 +8756,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: .p2align ; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8742,6 +8798,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-NEXT: s_mov_b64 s[2:3], 0 +; GFX1264-NEXT: .p2align ; GFX1264-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -8784,6 +8841,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mov_b32_e32 v1, s7 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-NEXT: .p2align ; GFX1232-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9340,6 +9398,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s15 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: .p2align ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v1 @@ -9401,6 +9460,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, 
-1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v1 @@ -9460,6 +9520,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_u32_e32 v0, s14, v1 @@ -9519,6 +9580,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1 @@ -9576,6 +9638,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1 @@ -9635,6 +9698,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-TRUE16-NEXT: .p2align ; GFX1164-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9699,6 +9763,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 +; 
GFX1164-FAKE16-NEXT: .p2align ; GFX1164-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9763,6 +9828,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-TRUE16-NEXT: .p2align ; GFX1132-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9824,6 +9890,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-FAKE16-NEXT: .p2align ; GFX1132-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9888,6 +9955,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1264-TRUE16-NEXT: .p2align ; GFX1264-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9953,6 +10021,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1264-FAKE16-NEXT: .p2align ; GFX1264-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10020,6 +10089,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-TRUE16-NEXT: .p2align ; GFX1232-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10085,6 +10155,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-FAKE16-NEXT: .p2align ; GFX1232-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10142,6 +10213,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: .p2align ; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v1 @@ -10183,6 +10255,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_and_b32_e32 v0, s9, v1 @@ -10223,6 +10296,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_and_b32_e32 v0, s9, v1 @@ -10264,6 +10338,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, s10 @@ -10304,6 +10379,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, s8 @@ -10344,6 +10420,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .p2align ; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -10386,6 +10463,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: .p2align ; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10427,6 +10505,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-NEXT: 
s_mov_b64 s[2:3], 0 +; GFX1264-NEXT: .p2align ; GFX1264-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -10469,6 +10548,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mov_b32_e32 v1, s7 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-NEXT: .p2align ; GFX1232-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10515,6 +10595,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: .p2align ; GFX7LESS-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v1 @@ -10559,6 +10640,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX8-NEXT: s_not_b32 s2, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 @@ -10600,6 +10682,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX9-NEXT: s_not_b32 s2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v1 @@ -10641,6 +10724,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: 
.p2align ; GFX1064-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v1 @@ -10682,6 +10766,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s6 ; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v1 @@ -10724,6 +10809,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-TRUE16-NEXT: .p2align ; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -10771,6 +10857,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-FAKE16-NEXT: .p2align ; GFX1164-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10818,6 +10905,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s6 ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1132-TRUE16-NEXT: .p2align ; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -10864,6 +10952,7 @@ define amdgpu_kernel 
void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s6 ; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1132-FAKE16-NEXT: .p2align ; GFX1132-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10910,6 +10999,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1264-TRUE16-NEXT: .p2align ; GFX1264-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -10957,6 +11047,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1264-FAKE16-NEXT: .p2align ; GFX1264-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11004,6 +11095,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s6 ; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1232-TRUE16-NEXT: .p2align ; GFX1232-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -11050,6 +11142,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 ; 
GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s6 ; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1232-FAKE16-NEXT: .p2align ; GFX1232-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11102,6 +11195,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: .p2align ; GFX7LESS-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v1 @@ -11147,6 +11241,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s10 @@ -11197,6 +11292,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11244,6 +11340,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11291,6 +11388,7 @@ define amdgpu_kernel void 
@uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_lshrrev_b32_sdwa v0, s2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11562,6 +11660,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264-TRUE16-NEXT: .p2align ; GFX1264-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11619,6 +11718,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264-FAKE16-NEXT: .p2align ; GFX1264-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11675,6 +11775,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-TRUE16-NEXT: .p2align ; GFX1232-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11731,6 +11832,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-FAKE16-NEXT: 
v_mov_b32_e32 v1, s7 ; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-FAKE16-NEXT: .p2align ; GFX1232-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11794,6 +11896,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s4, s2 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 +; GFX7LESS-NEXT: .p2align ; GFX7LESS-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -11845,6 +11948,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_mov_b32 s5, s3 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s11 @@ -11880,6 +11984,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_pk_add_f16 v0, v1, s10 @@ -11913,6 +12018,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: s_mov_b32 s4, s2 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_pk_add_f16 v0, v1, s10 @@ -11946,6 +12052,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: s_mov_b32 s4, s2 +; GFX1032-NEXT: 
.p2align ; GFX1032-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_pk_add_f16 v0, v1, s8 @@ -11979,6 +12086,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: s_mov_b32 s4, s2 +; GFX1164-NEXT: .p2align ; GFX1164-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -12014,6 +12122,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mov_b32_e32 v1, s4 ; GFX1132-NEXT: s_mov_b32 s4, s2 +; GFX1132-NEXT: .p2align ; GFX1132-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -12090,6 +12199,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s4, s2 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 +; GFX7LESS-NEXT: .p2align ; GFX7LESS-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -12139,6 +12249,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX8-NEXT: s_mov_b32 s5, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -12192,6 +12303,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_mov_b32 s4, s10 ; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -12241,6 +12353,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1064-NEXT: s_mov_b32 s5, s11 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -12290,6 +12403,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: s_mov_b32 s4, s10 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 17737cccec7c4..4790641c7eac5 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -453,6 +453,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -495,6 +496,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: 
s_ff1_i32_b64 s3, s[0:1] @@ -536,6 +538,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -576,6 +579,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -617,6 +621,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -658,6 +663,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -702,6 +708,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: 
.LBB2_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -1055,6 +1062,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1085,6 +1093,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1114,6 +1123,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1142,6 +1152,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1171,6 +1182,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB3_1: ; 
%ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -1200,6 +1212,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -1233,6 +1246,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -1960,6 +1974,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] @@ -2010,6 +2025,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] @@ -2059,6 +2075,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; 
GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] @@ -2107,6 +2124,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] @@ -2155,6 +2173,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 @@ -2203,6 +2222,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] @@ -2255,6 +2275,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 @@ -2837,6 
+2858,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] @@ -2872,6 +2894,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] @@ -2905,6 +2928,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] @@ -2937,6 +2961,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] @@ -2970,6 +2995,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 @@ -3003,6 +3029,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3] @@ -3039,6 +3066,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 @@ -3860,6 +3888,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -3902,6 +3931,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -3943,6 +3973,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: 
.p2align ; GFX9_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -3983,6 +4014,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -4024,6 +4056,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -4065,6 +4098,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -4109,6 +4143,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -4462,6 +4497,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; 
GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -4492,6 +4528,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -4521,6 +4558,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -4549,6 +4587,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -4578,6 +4617,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -4607,6 +4647,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; 
GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -4640,6 +4681,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -5390,6 +5432,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] @@ -5440,6 +5483,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] @@ -5489,6 +5533,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] @@ -5537,6 +5582,7 
@@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] @@ -5585,6 +5631,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 @@ -5633,6 +5680,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] @@ -5685,6 +5733,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 @@ -6267,6 +6316,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: 
.p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -6309,6 +6359,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -6350,6 +6401,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -6390,6 +6442,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -6431,6 +6484,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -6472,6 +6526,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; 
GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -6516,6 +6571,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -6872,6 +6928,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -6920,6 +6977,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -6967,6 +7025,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ 
-7013,6 +7072,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] @@ -7060,6 +7120,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 @@ -7107,6 +7168,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] @@ -7157,6 +7219,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 @@ -7626,6 +7689,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; 
GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -7668,6 +7732,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -7709,6 +7774,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -7749,6 +7815,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -7790,6 +7857,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -7831,6 +7899,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 
s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -7875,6 +7944,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -8230,6 +8300,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -8278,6 +8349,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -8325,6 +8397,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, 
s[2:3] @@ -8371,6 +8444,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] @@ -8418,6 +8492,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 @@ -8465,6 +8540,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] @@ -8515,6 +8591,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 @@ -8984,6 +9061,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; 
GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -9026,6 +9104,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -9067,6 +9146,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -9107,6 +9187,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -9148,6 +9229,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -9189,6 +9271,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 
s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -9233,6 +9316,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -9588,6 +9672,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -9636,6 +9721,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -9683,6 +9769,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, 
s[2:3] @@ -9729,6 +9816,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] @@ -9776,6 +9864,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 @@ -9823,6 +9912,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] @@ -9873,6 +9963,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 @@ -10342,6 +10433,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; 
GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -10384,6 +10476,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -10425,6 +10518,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -10465,6 +10559,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -10506,6 +10601,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_brev_b32 s0, 1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -10547,6 +10643,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; 
GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -10591,6 +10688,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_brev_b32 s0, 1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -11182,6 +11280,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -11239,6 +11338,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -11295,6 +11395,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -11350,6 +11451,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] @@ -11402,6 +11504,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 @@ -12168,6 +12271,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -12210,6 +12314,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -12251,6 +12356,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; 
GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -12291,6 +12397,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -12332,6 +12439,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_brev_b32 s0, -2 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -12373,6 +12481,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -12417,6 +12526,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_brev_b32 s0, -2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -13008,6 +13118,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) 
%out) { ; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -13065,6 +13176,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -13121,6 +13233,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -13176,6 +13289,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] @@ -13228,6 +13342,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 @@ -13994,6 +14109,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -14036,6 +14152,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -14077,6 +14194,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -14117,6 +14235,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -14158,6 +14277,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; 
GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -14199,6 +14319,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -14243,6 +14364,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -14829,6 +14951,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -14885,6 +15008,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -14940,6 +15064,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr 
addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -14994,6 +15119,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] @@ -15045,6 +15171,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 @@ -15807,6 +15934,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -15849,6 +15977,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -15890,6 +16019,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -15930,6 +16060,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -15971,6 +16102,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 @@ -16012,6 +16144,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align ; GFX1164_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -16056,6 +16189,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; 
GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align ; GFX1132_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 @@ -16643,6 +16777,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .p2align ; GFX7LESS_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -16699,6 +16834,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .p2align ; GFX8_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -16754,6 +16890,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .p2align ; GFX9_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] @@ -16808,6 +16945,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .p2align ; GFX1064_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] @@ -16859,6 +16997,7 @@ 
define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .p2align ; GFX1032_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index e4def28667ed4..c957e701f5bb2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -564,6 +564,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -607,6 +608,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -648,6 +650,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -688,6 +691,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; 
GFX10W64-NEXT: .p2align ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -728,6 +732,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .p2align ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 @@ -768,6 +773,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: .p2align ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -812,6 +818,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: s_mov_b32 s0, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: .p2align ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -853,6 +860,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: .p2align ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -898,6 +906,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: s_mov_b32 s0, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: .p2align ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -1618,6 +1627,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB6_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1661,6 +1671,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB6_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1702,6 +1713,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1742,6 +1754,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .p2align ; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1782,6 +1795,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .p2align ; GFX10W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 @@ -1822,6 +1836,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; 
GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: .p2align ; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -1866,6 +1881,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: s_mov_b32 s0, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: .p2align ; GFX11W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -1908,6 +1924,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: .p2align ; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -1953,6 +1970,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: s_mov_b32 s0, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: .p2align ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 39a3c9aade586..06ea2d061a13a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -580,6 +580,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -624,6 +625,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -666,6 +668,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -707,6 +710,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .p2align ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -748,6 +752,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .p2align ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 @@ -789,6 +794,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: .p2align ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -834,6 +840,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 
; GFX11W32-NEXT: s_mov_b32 s0, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: .p2align ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -875,6 +882,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: .p2align ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -921,6 +929,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: s_mov_b32 s0, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: .p2align ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -1785,6 +1794,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB7_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1829,6 +1839,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1871,6 +1882,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1912,6 +1924,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .p2align ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] @@ -1953,6 +1966,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .p2align ; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 @@ -1994,6 +2008,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: .p2align ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -2039,6 +2054,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: s_mov_b32 s0, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: .p2align ; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 @@ -2081,6 +2097,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: .p2align ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] @@ -2127,6 +2144,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr 
addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: s_mov_b32 s0, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: .p2align ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index e4323999d19c3..0651f2dff0679 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -11,6 +11,7 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -123,6 +124,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -362,6 +364,7 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -383,6 +386,7 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -411,6 +415,7 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: flat_load_b32 v3, v[0:1] ; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: .p2align ; GFX1100-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll index 2cd50b3b1b2a2..71c8f81933d0a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -7,6 +7,7 @@ define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -33,6 +34,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_dword v2, v[0:1], off ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -60,6 +62,7 @@ define i32 @atomic_nand_i32_flat(ptr %ptr) nounwind { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index 5fc9f4a0f8038..cd1d3e25a8b17 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -352,6 +352,7 @@ define i16 
@global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -394,6 +395,7 @@ define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -436,6 +438,7 @@ define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -478,6 +481,7 @@ define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -1332,6 +1336,7 @@ define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1374,6 +1379,7 @@ define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) { ; 
GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1416,6 +1422,7 @@ define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1458,6 +1465,7 @@ define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 614b1e38a530f..5651d2b82e4c0 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -88,6 +88,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[18:19] ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[16:17] ; CHECK-NEXT: s_branch .LBB0_6 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll index 722dff0e18a23..a643cad4f7b63 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll +++ 
b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s +; RUN: llc -amdgpu-disable-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1030 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s +; RUN: llc -amdgpu-disable-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s ; For gfx1010, overestimate the branch size in case we need to insert ; a nop for the buggy offset. @@ -15,7 +15,8 @@ ; GFX1010-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} ; GFX1010-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ([[ENDBB:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 ; GFX1010-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ([[ENDBB:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])>>32 -; GFX1010: [[RELAX_BB]]: +; GFX1010: .p2align 4 +; GFX1010-NEXT: [[RELAX_BB]]: ; GCN: v_nop ; GCN: s_sleep diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index 243f0ed3a8d0d..73c3e557bc69d 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -200,6 +200,7 @@ define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: long_backward_sbranch: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB4_1: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -302,6 +303,7 @@ define amdgpu_kernel void 
@uniform_unconditional_min_long_backward_branch(ptr ad ; GCN-LABEL: uniform_unconditional_min_long_backward_branch: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_mov_b32 vcc_lo, exec_lo +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB6_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ;;#ASMSTART @@ -472,6 +474,7 @@ define amdgpu_kernel void @analyze_mask_branch() #0 { ; GCN-NEXT: .Lpost_addpc11: ; GCN-NEXT: .LBB9_3: ; %loop.preheader ; GCN-NEXT: s_mov_b32 vcc_lo, 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB9_4: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 5959f76492f3c..f9ac704f7e985 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -474,6 +474,7 @@ define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: long_backward_sbranch: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB4_1: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_add_i32 s0, s0, 1 @@ -524,6 +525,7 @@ define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { ; GFX12-LABEL: long_backward_sbranch: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB4_1: ; %bb2 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -747,6 +749,7 @@ define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr ad ; GCN-LABEL: uniform_unconditional_min_long_backward_branch: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_and_b64 vcc, exec, -1 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB6_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ;;#ASMSTART @@ -795,6 +798,7 @@ define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr ad ; GFX12-LABEL: 
uniform_unconditional_min_long_backward_branch: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: ;;#ASMSTART @@ -1109,6 +1113,7 @@ define amdgpu_kernel void @analyze_mask_branch() #0 { ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB9_3: ; %loop.preheader ; GCN-NEXT: s_and_b64 vcc, exec, 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB9_4: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ;;#ASMSTART @@ -1211,6 +1216,7 @@ define amdgpu_kernel void @analyze_mask_branch() #0 { ; GFX12-NEXT: s_setpc_b64 s[0:1] ; GFX12-NEXT: .LBB9_3: ; %loop.preheader ; GFX12-NEXT: s_mov_b32 vcc_lo, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB9_4: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 0ceb9019eb990..f69e71a9d27ae 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -61,6 +61,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -99,6 +100,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -126,6 +128,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -153,6 +156,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -180,6 +184,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -249,6 +254,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -295,6 +301,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -321,6 +328,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: 
.LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -347,6 +355,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -834,6 +843,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -863,6 +873,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -889,6 +900,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -916,6 +928,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -943,6 +956,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -970,6 +984,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1039,6 +1054,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1067,6 +1083,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1092,6 +1109,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1118,6 +1136,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 
+; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1144,6 +1163,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1170,6 +1190,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1229,6 +1250,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1259,6 +1281,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1288,6 +1311,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1314,6 
+1338,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1341,6 +1366,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1368,6 +1394,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1395,6 +1422,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1454,6 +1482,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1484,6 +1513,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, 
v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1513,6 +1543,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1539,6 +1570,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1566,6 +1598,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1593,6 +1626,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1620,6 +1654,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1679,6 +1714,7 @@ 
define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1709,6 +1745,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1738,6 +1775,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1764,6 +1802,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1791,6 +1830,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1818,6 +1858,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 
s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1845,6 +1886,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1888,6 +1930,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1930,6 +1973,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1962,6 +2006,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2004,6 +2049,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; 
GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2035,6 +2081,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2066,6 +2113,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2097,6 +2145,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2138,6 +2187,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2179,6 +2229,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2209,6 +2260,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2249,6 +2301,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2278,6 +2331,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2307,6 +2361,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2336,6 +2391,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2908,6 +2964,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, 
s[0:3], null offen offset:2048 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2950,6 +3007,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2982,6 +3040,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3015,6 +3074,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3043,6 +3103,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -3074,6 +3135,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3105,6 +3167,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3136,6 +3199,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3178,6 +3242,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3220,6 +3285,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3252,6 +3318,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3294,6 +3361,7 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -3325,6 +3393,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3356,6 +3425,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3387,6 +3457,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3439,6 +3510,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -3487,6 +3559,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; 
GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -3528,6 +3601,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -3564,6 +3638,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3605,6 +3680,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -3645,6 +3721,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3681,6 +3758,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3714,6 +3792,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -3748,6 +3827,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3785,6 +3865,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3825,6 +3906,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3878,6 +3960,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -3925,6 +4008,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -3965,6 +4049,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -4000,6 +4085,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -4040,6 +4126,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -4079,6 +4166,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4114,6 +4202,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4146,6 +4235,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -4179,6 +4269,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4215,6 +4306,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4253,6 +4345,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5106,6 +5199,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-TRUE16-NEXT: 
s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -5168,6 +5262,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -5220,6 +5315,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -5367,6 +5463,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5410,6 +5507,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5450,6 +5548,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; 
GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -5490,6 +5589,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -5534,6 +5634,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5574,6 +5675,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5627,6 +5729,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -5688,6 +5791,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -5739,6 +5843,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -5883,6 +5988,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5925,6 +6031,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5964,6 +6071,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6003,6 +6111,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -6046,6 +6155,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6084,6 +6194,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7034,6 +7145,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -7064,6 +7176,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7102,6 +7215,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -7129,6 +7243,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -7165,6 +7280,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -7211,6 +7327,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -7282,6 +7399,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -7312,6 +7430,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7358,6 +7477,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -7394,6 +7514,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7440,6 +7561,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -8009,6 +8131,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -8039,6 +8162,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8068,6 +8192,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8094,6 +8219,7 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -8121,6 +8247,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8157,6 +8284,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8203,6 +8331,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8274,6 +8403,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -8304,6 +8434,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, 
v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8332,6 +8463,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8357,6 +8489,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -8383,6 +8516,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8419,6 +8553,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -8465,6 +8600,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 
v4, v4 @@ -8536,6 +8672,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -8566,6 +8703,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8595,6 +8733,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8621,6 +8760,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -8648,6 +8788,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8684,6 +8825,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8730,6 +8872,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8801,6 +8944,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -8831,6 +8975,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8859,6 +9004,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8884,6 +9030,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; 
GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -8910,6 +9057,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8946,6 +9094,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -8992,6 +9141,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -9061,6 +9211,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -9210,6 +9361,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt 
vmcnt(0) @@ -9257,6 +9409,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9301,6 +9454,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -9344,6 +9498,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -9394,6 +9549,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -9436,6 +9592,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -9497,6 +9654,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi 
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -9639,6 +9797,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9685,6 +9844,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9728,6 +9888,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -9770,6 +9931,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -9820,6 +9982,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: 
v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -9862,6 +10025,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -10668,6 +10832,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -10817,6 +10982,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10864,6 +11030,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10908,6 +11075,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: 
.LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -10951,6 +11119,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -11001,6 +11170,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -11043,6 +11213,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -11104,6 +11275,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -11246,6 +11418,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) @@ -11292,6 +11465,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11335,6 +11509,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -11377,6 +11552,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -11427,6 +11603,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -11469,6 +11646,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -11531,6 +11709,7 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -11680,6 +11859,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11727,6 +11907,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11771,6 +11952,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -11814,6 +11996,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -11864,6 +12047,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; 
GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -11906,6 +12090,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -11967,6 +12152,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -12109,6 +12295,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12155,6 +12342,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -12198,6 +12386,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 
; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -12240,6 +12429,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -12290,6 +12480,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -12332,6 +12523,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -12393,6 +12585,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -12535,6 +12728,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB33_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12581,6 +12775,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -12624,6 +12819,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -12666,6 +12862,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -12716,6 +12913,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -12758,6 +12956,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ 
-12842,6 +13041,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12871,6 +13071,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -12899,6 +13100,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -12926,6 +13128,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -12953,6 +13156,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12980,6 +13184,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: 
s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index cad4c39eaf39f..3cd73d604e556 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -41,6 +41,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -92,6 +93,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -120,6 +122,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -149,6 +152,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -216,6 +220,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -266,6 +271,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -293,6 +299,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -321,6 +328,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -781,6 +789,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -809,6 +818,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen 
offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -842,6 +852,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -873,6 +884,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -901,6 +913,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -930,6 +943,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -959,6 +973,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) @@ -988,6 +1003,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1039,6 +1055,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1090,6 +1107,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1118,6 +1136,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1147,6 +1166,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1209,6 +1229,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: 
buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1254,6 +1275,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1309,6 +1331,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1342,6 +1365,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1402,6 +1426,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1445,6 +1470,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: 
.p2align ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1497,6 +1523,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1528,6 +1555,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2016,6 +2044,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2061,6 +2090,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2096,6 +2126,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2131,6 +2162,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2161,6 +2193,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2194,6 +2227,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2227,6 +2261,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2260,6 +2295,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2304,6 +2340,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; 
GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2349,6 +2386,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2404,6 +2442,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2437,6 +2476,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2508,6 +2548,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -2559,6 +2600,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 
null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -2602,6 +2644,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -2640,6 +2683,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2727,6 +2771,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2765,6 +2810,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2800,6 +2846,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 +; 
GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2836,6 +2883,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2874,6 +2922,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2914,6 +2963,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2968,6 +3018,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -3018,6 +3069,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -3060,6 +3112,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -3097,6 +3150,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3182,6 +3236,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3219,6 +3274,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3253,6 +3309,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -3288,6 +3345,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3325,6 +3383,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3363,6 +3422,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4238,6 +4298,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -4300,6 +4361,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -4352,6 +4414,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -4499,6 +4562,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4542,6 +4606,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4582,6 +4647,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -4622,6 +4688,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -4666,6 +4733,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4707,6 +4775,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4761,6 +4830,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -4822,6 +4892,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -4873,6 +4944,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -5017,6 +5089,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt 
vmcnt(0) @@ -5059,6 +5132,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5098,6 +5172,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -5137,6 +5212,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -5180,6 +5256,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5219,6 +5296,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6153,6 +6231,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: 
buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6186,6 +6265,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6216,6 +6296,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -6249,6 +6330,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6280,6 +6362,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6308,6 +6391,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: 
.LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6338,6 +6422,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -6376,6 +6461,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -6422,6 +6508,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -6473,6 +6560,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6504,6 +6592,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6533,6 +6622,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -6564,6 +6654,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6594,6 +6685,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6621,6 +6713,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6650,6 +6743,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -6688,6 +6782,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6734,6 +6829,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7403,6 +7499,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -7461,6 +7558,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -7515,6 +7613,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: 
s_waitcnt vmcnt(0) @@ -7664,6 +7763,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7711,6 +7811,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7755,6 +7856,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -7798,6 +7900,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -7848,6 +7951,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -7890,6 +7994,7 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -7937,6 +8042,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -7990,6 +8096,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -8041,6 +8148,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -8183,6 +8291,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8229,6 +8338,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8272,6 +8382,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -8314,6 +8425,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8364,6 +8476,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8406,6 +8519,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -9364,6 +9478,7 @@ define float 
@buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -9415,6 +9530,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9445,6 +9561,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -9474,6 +9591,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 6275afd2c6994..4339672724cc5 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -41,6 +41,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB0_1: 
; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -92,6 +93,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -120,6 +122,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -149,6 +152,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -216,6 +220,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -266,6 +271,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -293,6 +299,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -321,6 +328,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -781,6 +789,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -809,6 +818,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -842,6 +852,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -873,6 +884,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -901,6 +913,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -930,6 +943,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -959,6 +973,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -988,6 +1003,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1039,6 +1055,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: 
s_waitcnt vmcnt(0) @@ -1090,6 +1107,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1118,6 +1136,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1147,6 +1166,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1209,6 +1229,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1254,6 +1275,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1309,6 +1331,7 @@ define double 
@buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1342,6 +1365,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1402,6 +1426,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1445,6 +1470,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1497,6 +1523,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1528,6 +1555,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 
0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2016,6 +2044,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2061,6 +2090,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2096,6 +2126,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2131,6 +2162,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2161,6 +2193,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: 
.p2align ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2194,6 +2227,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2227,6 +2261,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2260,6 +2295,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2304,6 +2340,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2349,6 +2386,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2404,6 +2442,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2437,6 +2476,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2508,6 +2548,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -2559,6 +2600,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -2602,6 +2644,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -2640,6 +2683,7 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2727,6 +2771,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2765,6 +2810,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2800,6 +2846,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2836,6 +2883,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2874,6 +2922,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_not_b32 
s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2914,6 +2963,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2968,6 +3018,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -3018,6 +3069,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -3060,6 +3112,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -3097,6 +3150,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; 
GFX11-TRUE16-NEXT: .p2align ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3182,6 +3236,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3219,6 +3274,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3253,6 +3309,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -3288,6 +3345,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3325,6 +3383,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3363,6 +3422,7 @@ 
define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4238,6 +4298,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -4300,6 +4361,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -4352,6 +4414,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -4499,6 +4562,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4542,6 +4606,7 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4582,6 +4647,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -4622,6 +4688,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -4666,6 +4733,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4707,6 +4775,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4761,6 +4830,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; 
GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -4822,6 +4892,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -4873,6 +4944,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -5017,6 +5089,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5059,6 +5132,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5098,6 +5172,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: .p2align ; 
GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -5137,6 +5212,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -5180,6 +5256,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5219,6 +5296,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6153,6 +6231,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6186,6 +6265,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6216,6 +6296,7 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -6249,6 +6330,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6280,6 +6362,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6308,6 +6391,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6338,6 +6422,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -6376,6 +6461,7 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -6422,6 +6508,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -6473,6 +6560,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6504,6 +6592,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6533,6 +6622,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -6564,6 +6654,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; 
GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6594,6 +6685,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6621,6 +6713,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6650,6 +6743,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -6688,6 +6782,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6734,6 +6829,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: 
.LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7403,6 +7499,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -7461,6 +7558,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -7515,6 +7613,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -7664,6 +7763,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7711,6 +7811,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; 
GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7755,6 +7856,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -7798,6 +7900,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -7848,6 +7951,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -7890,6 +7994,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -7937,6 +8042,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; 
GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -7990,6 +8096,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -8041,6 +8148,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -8183,6 +8291,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8229,6 +8338,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8272,6 +8382,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; 
GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -8314,6 +8425,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8364,6 +8476,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8406,6 +8519,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -9364,6 +9478,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -9415,6 +9530,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB22_1: ; 
%atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9445,6 +9561,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -9474,6 +9591,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 3c991cfb7a1aa..1ff0bce7685c0 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -43,6 +43,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SDAG-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SDAG-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SDAG-NEXT: .p2align ; SDAG-NEXT: .LBB0_1: ; %load-store-loop ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: s_add_i32 s9, s20, s8 @@ -154,6 +155,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GISEL-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: .p2align ; GISEL-NEXT: 
.LBB0_1: ; %load-store-loop ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_add_u32_e32 v46, s20, v1 @@ -255,6 +257,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: s_mov_b32 s17, s10 ; SDAG-GFX942-NEXT: s_mov_b32 s2, s9 ; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] +; SDAG-GFX942-NEXT: .p2align ; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 @@ -350,6 +353,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] +; SDAG-GFX1100-NEXT: .p2align ; SDAG-GFX1100-NEXT: .LBB0_1: ; %load-store-loop ; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16 @@ -438,6 +442,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3] ; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GISEL-GFX942-NEXT: .p2align ; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 @@ -510,6 +515,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3] ; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11 ; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] +; GISEL-GFX1100-NEXT: .p2align ; GISEL-GFX1100-NEXT: .LBB0_1: ; %load-store-loop ; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 @@ -595,6 +601,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 
4-byte Folded Spill ; SDAG-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SDAG-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SDAG-NEXT: .p2align ; SDAG-NEXT: .LBB1_1: ; %load-store-loop ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_add_u32_e32 v45, s20, v0 @@ -698,6 +705,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GISEL-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: .p2align ; GISEL-NEXT: .LBB1_1: ; %load-store-loop ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_add_u32_e32 v45, s20, v0 @@ -801,6 +809,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: s_mov_b32 s13, s10 ; SDAG-GFX942-NEXT: s_mov_b32 s2, s9 ; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13] +; SDAG-GFX942-NEXT: .p2align ; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s0, v0 @@ -874,6 +883,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13] +; SDAG-GFX1100-NEXT: .p2align ; SDAG-GFX1100-NEXT: .LBB1_1: ; %load-store-loop ; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 @@ -958,6 +968,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3] ; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GISEL-GFX942-NEXT: .p2align ; GISEL-GFX942-NEXT: .LBB1_1: ; 
%load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s0, v0 @@ -1033,6 +1044,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3] ; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11 ; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] +; GISEL-GFX1100-NEXT: .p2align ; GISEL-GFX1100-NEXT: .LBB1_1: ; %load-store-loop ; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll index 07816f1ed6a65..212f9933f3bbc 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll @@ -24,6 +24,7 @@ define void @f(i32 %arg, ptr %ptr) { ; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s6 ; ISA-NEXT: v_cvt_f32_i32_e32 v5, s5 ; ISA-NEXT: s_mov_b32 s4, 0 +; ISA-NEXT: .p2align ; ISA-NEXT: .LBB0_1: ; %bb14 ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 ; ISA-NEXT: v_mov_b32_e32 v7, v6 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index b5352bef50b1e..08aaba8f8b691 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -15,6 +15,7 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, -1 ; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_2: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -53,6 +54,7 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm +; GCN_DBG-NEXT: .p2align ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; 
GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -113,6 +115,7 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi ; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, -1 ; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB1_1: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -376,6 +379,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], -1 ; GCN-NEXT: s_add_i32 s0, s4, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB4_1: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll index c7f7f30a5e6bd..6e0e41a07b99c 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll @@ -8,6 +8,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i32 inreg %val) { ; GFX67-LABEL: test_sink_smem_offset_400: ; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: .p2align ; GFX67-NEXT: .LBB0_1: ; %loop ; GFX67-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -20,6 +21,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3 ; ; GFX89-LABEL: test_sink_smem_offset_400: ; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: .p2align ; GFX89-NEXT: .LBB0_1: ; %loop ; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) @@ -32,6 +34,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3 ; ; GFX12-LABEL: test_sink_smem_offset_400: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB0_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 
0x0 @@ -62,6 +65,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_add_u32 s0, s0, 0xfa0 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: .p2align ; GFX6-NEXT: .LBB1_1: ; %loop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -74,6 +78,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i ; ; GFX7-LABEL: test_sink_smem_offset_4000: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB1_1: ; %loop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -86,6 +91,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i ; ; GFX89-LABEL: test_sink_smem_offset_4000: ; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: .p2align ; GFX89-NEXT: .LBB1_1: ; %loop ; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) @@ -98,6 +104,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i ; ; GFX12-LABEL: test_sink_smem_offset_4000: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB1_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -128,6 +135,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr ; GFX689: ; %bb.0: ; %entry ; GFX689-NEXT: s_add_u32 s0, s0, 0x3d0900 ; GFX689-NEXT: s_addc_u32 s1, s1, 0 +; GFX689-NEXT: .p2align ; GFX689-NEXT: .LBB2_1: ; %loop ; GFX689-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX689-NEXT: s_waitcnt lgkmcnt(0) @@ -140,6 +148,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr ; ; GFX7-LABEL: test_sink_smem_offset_4000000: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB2_1: ; %loop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -152,6 +161,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr 
addrspace(4) inreg %ptr ; ; GFX12-LABEL: test_sink_smem_offset_4000000: ; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB2_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -182,6 +192,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt ; GFX689: ; %bb.0: ; %entry ; GFX689-NEXT: s_add_u32 s0, s0, 0x2625a00 ; GFX689-NEXT: s_addc_u32 s1, s1, 0 +; GFX689-NEXT: .p2align ; GFX689-NEXT: .LBB3_1: ; %loop ; GFX689-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX689-NEXT: s_waitcnt lgkmcnt(0) @@ -194,6 +205,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt ; ; GFX7-LABEL: test_sink_smem_offset_40000000: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB3_1: ; %loop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -207,6 +219,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt ; GFX12-LABEL: test_sink_smem_offset_40000000: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x2625a00 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB3_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -237,6 +250,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg ; GFX6789: ; %bb.0: ; %entry ; GFX6789-NEXT: s_add_u32 s0, s0, 0x502f9000 ; GFX6789-NEXT: s_addc_u32 s1, s1, 9 +; GFX6789-NEXT: .p2align ; GFX6789-NEXT: .LBB4_1: ; %loop ; GFX6789-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6789-NEXT: s_waitcnt lgkmcnt(0) @@ -253,6 +267,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg ; GFX12-NEXT: s_mov_b32 s5, 9 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB4_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -283,6 
+298,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr, ; GFX6789: ; %bb.0: ; %entry ; GFX6789-NEXT: s_add_u32 s0, s0, 0xfffffe70 ; GFX6789-NEXT: s_addc_u32 s1, s1, -1 +; GFX6789-NEXT: .p2align ; GFX6789-NEXT: .LBB5_1: ; %loop ; GFX6789-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6789-NEXT: s_waitcnt lgkmcnt(0) @@ -299,6 +315,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr, ; GFX12-NEXT: s_mov_b32 s5, -1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB5_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -330,6 +347,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400_32bit(ptr addrspace(6) inreg ; GFX6789: ; %bb.0: ; %entry ; GFX6789-NEXT: s_add_i32 s2, s0, 0xfffffe70 ; GFX6789-NEXT: s_mov_b32 s3, 0 +; GFX6789-NEXT: .p2align ; GFX6789-NEXT: .LBB6_1: ; %loop ; GFX6789-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6789-NEXT: s_waitcnt lgkmcnt(0) @@ -344,6 +362,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400_32bit(ptr addrspace(6) inreg ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_add_co_i32 s2, s0, 0xfffffe70 ; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll b/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll index d07cc84865bea..4d1a82264632c 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll +++ b/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll @@ -20,6 +20,7 @@ define amdgpu_kernel void @hoge(i1 %c0, i1 %c1, i1 %c2, i1 %c3, i1 %c4) { ; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %bb25 ; CHECK-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index 31c23b94a8de8..a69844bd5ea51 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -334,6 +334,7 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0 ; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base ; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 +; DAGISEL-ASM-NEXT: .p2align ; DAGISEL-ASM-NEXT: .LBB11_3: ; %finally ; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 ; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[6:7] @@ -365,6 +366,7 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 ; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 +; GISEL-ASM-NEXT: .p2align ; GISEL-ASM-NEXT: .LBB11_3: ; %finally ; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index c30ce8c8ed507..279b8dfcc5941 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -986,6 +986,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_branch .LBB5_3 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB5_1: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll index 2558da401f89a..b454f6ea39c4c 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll @@ -14,6 +14,7 @@ 
define i32 @combine_add_zext_xor(i32 inreg %cond) { ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB0_2 +; GFX1010-NEXT: .p2align ; GFX1010-NEXT: .LBB0_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX1010-NEXT: s_xor_b32 s5, s5, -1 @@ -46,6 +47,7 @@ define i32 @combine_add_zext_xor(i32 inreg %cond) { ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB0_2 +; GFX1100-NEXT: .p2align ; GFX1100-NEXT: .LBB0_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -107,6 +109,7 @@ define i32 @combine_sub_zext_xor(i32 inreg %cond) { ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB1_2 +; GFX1010-NEXT: .p2align ; GFX1010-NEXT: .LBB1_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GFX1010-NEXT: s_xor_b32 s5, s5, -1 @@ -139,6 +142,7 @@ define i32 @combine_sub_zext_xor(i32 inreg %cond) { ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB1_2 +; GFX1100-NEXT: .p2align ; GFX1100-NEXT: .LBB1_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -200,6 +204,7 @@ define i32 @combine_add_zext_or(i32 inreg %cond) { ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB2_2 +; GFX1010-NEXT: .p2align ; GFX1010-NEXT: .LBB2_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX1010-NEXT: s_cmpk_gt_i32 s5, 0xfbe6 @@ -234,6 +239,7 @@ define i32 @combine_add_zext_or(i32 inreg %cond) { ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB2_2 +; GFX1100-NEXT: .p2align ; 
GFX1100-NEXT: .LBB2_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX1100-NEXT: s_cmpk_gt_i32 s1, 0xfbe6 @@ -297,6 +303,7 @@ define i32 @combine_sub_zext_or(i32 inreg %cond) { ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB3_2 +; GFX1010-NEXT: .p2align ; GFX1010-NEXT: .LBB3_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX1010-NEXT: s_cmpk_gt_i32 s5, 0xfbe6 @@ -331,6 +338,7 @@ define i32 @combine_sub_zext_or(i32 inreg %cond) { ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB3_2 +; GFX1100-NEXT: .p2align ; GFX1100-NEXT: .LBB3_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX1100-NEXT: s_cmpk_gt_i32 s1, 0xfbe6 @@ -394,6 +402,7 @@ define i32 @combine_add_zext_and(i32 inreg %cond) { ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB4_2 +; GFX1010-NEXT: .p2align ; GFX1010-NEXT: .LBB4_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 @@ -425,6 +434,7 @@ define i32 @combine_add_zext_and(i32 inreg %cond) { ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB4_2 +; GFX1100-NEXT: .p2align ; GFX1100-NEXT: .LBB4_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -486,6 +496,7 @@ define i32 @combine_sub_zext_and(i32 inreg %cond) { ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB5_2 +; GFX1010-NEXT: .p2align ; GFX1010-NEXT: .LBB5_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 @@ -517,6 +528,7 @@ define i32 @combine_sub_zext_and(i32 inreg %cond) { ; GFX1100-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB5_2 +; GFX1100-NEXT: .p2align ; GFX1100-NEXT: .LBB5_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll index a13f3513c660e..f7de1c6241da0 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll @@ -6,6 +6,7 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_cmp_lt_u32 0, 16 ; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_set_gpr_idx_on 0, gpr_idx(DST) @@ -47,6 +48,7 @@ define void @phi_with_alloca_and_divergent_copy_to_reg(ptr addrspace(5) %diverge ; CHECK-NEXT: v_mov_b32_e32 v6, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_lshrrev_b32_e64 v2, 6, s32 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll index 931a14473c340..cfd7b4ce8aea3 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll @@ -18,6 +18,7 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_mov_b32 s0, 0 ; GFX7-NEXT: s_mov_b32 s1, 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB0_1: ; %loop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 @@ -46,6 +47,7 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, ; GFX8-NEXT: s_addc_u32 s89, s89, 0 ; GFX8-NEXT: s_mov_b32 s0, 0 ; 
GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB0_1: ; %loop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 12de3750640db..e1c1bf8e1a7b1 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -125,6 +125,7 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 @@ -427,6 +428,7 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 @@ -464,6 +466,7 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 747affa928601..d92da0c7b3c24 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -150,6 +150,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[4:5] @@ -1363,6 +1364,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-G-NEXT: .p2align ; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] @@ -2430,6 +2432,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v17, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 @@ -3490,6 +3493,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 ; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-G-NEXT: .p2align ; GFX9-G-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 7ea98a16e3b84..fec0ea9d3914e 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -147,6 +147,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v17, 0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0 ; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: .p2align ; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19 @@ -344,6 +345,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: .p2align ; SDAG-NEXT: .LBB0_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 @@ -564,6 +566,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: .p2align ; GISEL-NEXT: .LBB0_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_lshrrev_b32_e32 v36, 31, v17 @@ -755,6 +758,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: .p2align ; GISEL-NEXT: .LBB0_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1 @@ -953,6 +957,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: .p2align ; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v17 @@ -1128,6 +1133,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc ; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: .p2align ; SDAG-NEXT: .LBB1_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 @@ -1320,6 +1326,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v20, s6 ; GISEL-NEXT: v_mov_b32_e32 v19, s5 ; GISEL-NEXT: v_mov_b32_e32 v18, s4 +; GISEL-NEXT: .p2align ; GISEL-NEXT: .LBB1_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v23 @@ -1493,6 +1500,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v22, s6 ; GISEL-NEXT: 
v_mov_b32_e32 v21, s5 ; GISEL-NEXT: v_mov_b32_e32 v20, s4 +; GISEL-NEXT: .p2align ; GISEL-NEXT: .LBB1_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], 1 @@ -1700,6 +1708,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v25, v9, v17, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v24, v8, v16, vcc ; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: .p2align ; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 @@ -1895,6 +1904,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v23, v15, v9, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v22, v14, v8, vcc ; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: .p2align ; SDAG-NEXT: .LBB2_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 @@ -2162,6 +2172,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: .p2align ; GISEL-NEXT: .LBB2_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21 @@ -2353,6 +2364,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: .p2align ; GISEL-NEXT: .LBB2_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 @@ -2583,6 +2595,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v27, v17, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v26, v16, v0, vcc ; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: .p2align ; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3 ; SDAG-NEXT: 
; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23 @@ -2758,6 +2771,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v5, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v4, vcc ; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: .p2align ; SDAG-NEXT: .LBB3_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 @@ -2989,6 +3003,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v18, s6 ; GISEL-NEXT: v_mov_b32_e32 v17, s5 ; GISEL-NEXT: v_mov_b32_e32 v16, s4 +; GISEL-NEXT: .p2align ; GISEL-NEXT: .LBB3_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_lshrrev_b32_e32 v38, 31, v23 @@ -3162,6 +3177,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v18, s6 ; GISEL-NEXT: v_mov_b32_e32 v17, s5 ; GISEL-NEXT: v_mov_b32_e32 v16, s4 +; GISEL-NEXT: .p2align ; GISEL-NEXT: .LBB3_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[24:25], 1 diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index 8c3d20ffb02fd..a13731f930aed 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -27,6 +27,7 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 ; ISA-NEXT: ; implicit-def: $sgpr2_sgpr3 ; ISA-NEXT: s_branch .LBB0_3 +; ISA-NEXT: .p2align ; ISA-NEXT: .LBB0_1: ; %Flow1 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; ISA-NEXT: s_or_b64 exec, exec, s[4:5] @@ -132,6 +133,7 @@ define amdgpu_ps void @i1_copy_assert(i1 %v4) { ; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 ; ISA-NEXT: ; implicit-def: $sgpr2_sgpr3 ; ISA-NEXT: s_branch 
.LBB1_3 +; ISA-NEXT: .p2align ; ISA-NEXT: .LBB1_1: ; %endif1 ; ISA-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll index fd64ea3ae1c4b..5a04fcc4a884e 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll @@ -12,6 +12,7 @@ define i32 @rocrand_regression(ptr addrspace(1) %arg, i32 %arg0, i1 %cmp7) { ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_2 Depth 2 ; CHECK-NEXT: s_mov_b64 s[6:7], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_2: ; %while.cond ; CHECK-NEXT: ; Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll index d03d53a8cbbaa..95a2bf2e5f784 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll @@ -13,6 +13,7 @@ define double @issue130646(i64 %arg) { ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_branch .LBB0_2 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %for.body.5 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_lshr_b64 s[6:7], s[4:5], 1 @@ -95,6 +96,7 @@ define amdgpu_cs void @issue130119(i1 %arg) { ; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: ; implicit-def: $sgpr8_sgpr9 ; CHECK-NEXT: s_branch .LBB1_4 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB1_3: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB1_4 Depth=2 ; CHECK-NEXT: s_xor_b64 s[14:15], s[14:15], -1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll 
b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index ae5da3ad094c7..508a400960324 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -53,7 +53,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -127,7 +127,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX908: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -149,7 +149,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -171,7 +171,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -232,7 +232,7 @@ define float 
@flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX10: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -311,7 +311,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -335,7 +335,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -358,7 +358,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -425,7 +425,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start 
+; GFX10: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -509,7 +509,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX908: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -532,7 +532,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -555,7 +555,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -615,7 +615,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX10: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -742,7 +742,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -763,7 +763,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -824,7 +824,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX10: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -957,7 +957,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -980,7 +980,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1048,7 +1048,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX10: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1181,7 +1181,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1204,7 +1204,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1266,7 +1266,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -1347,7 +1347,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1371,7 +1371,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -1394,7 +1394,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -1457,7 +1457,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1592,7 +1592,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1615,7 +1615,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1664,7 +1664,7 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1690,7 +1690,7 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1713,7 +1713,7 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -1734,7 +1734,7 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1757,7 +1757,7 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1780,7 +1780,7 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1842,7 +1842,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1865,7 +1865,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -1886,7 +1886,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1909,7 +1909,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1932,7 +1932,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1994,7 +1994,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2127,7 +2127,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2150,7 +2150,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2199,7 +2199,7 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2225,7 +2225,7 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2248,7 +2248,7 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -2269,7 +2269,7 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2292,7 +2292,7 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2315,7 +2315,7 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2378,7 +2378,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -2452,7 +2452,7 @@ define float 
@flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2474,7 +2474,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -2496,7 +2496,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -2557,7 +2557,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -2636,7 +2636,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: 
.LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2660,7 +2660,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -2683,7 +2683,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -2750,7 +2750,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -2834,7 +2834,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -2857,7 +2857,7 @@ define float 
@flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -2880,7 +2880,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -2940,7 +2940,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3067,7 +3067,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3088,7 +3088,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7: 
.LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3149,7 +3149,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3282,7 +3282,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3305,7 +3305,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3373,7 +3373,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3506,7 +3506,7 @@ define void 
@flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3529,7 +3529,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3591,7 +3591,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -3672,7 +3672,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -3696,7 +3696,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; 
%atomicrmw.start +; GFX8: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -3719,7 +3719,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -3782,7 +3782,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3917,7 +3917,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3940,7 +3940,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4002,7 +4002,7 @@ define float 
@flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -4083,7 +4083,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -4107,7 +4107,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -4130,7 +4130,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -4193,7 +4193,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB21_1: ; 
%atomicrmw.start +; GFX10: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4328,7 +4328,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4351,7 +4351,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4400,7 +4400,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -4426,7 +4426,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -4450,7 +4450,7 @@ define float 
@flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -4472,7 +4472,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -4494,7 +4494,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -4516,7 +4516,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -4565,7 +4565,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; 
%atomicrmw.start +; GFX11: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4589,7 +4589,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4612,7 +4612,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -4633,7 +4633,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4654,7 +4654,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4675,7 
+4675,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4723,7 +4723,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -4749,7 +4749,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -4773,7 +4773,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -4795,7 +4795,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -4817,7 +4817,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -4839,7 +4839,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -4888,7 +4888,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4912,7 +4912,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4935,7 +4935,7 @@ define 
void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -4956,7 +4956,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4977,7 +4977,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4998,7 +4998,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5056,7 +5056,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; 
GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -5130,7 +5130,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -5152,7 +5152,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -5174,7 +5174,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -5234,7 +5234,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5361,7 +5361,7 @@ 
define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5382,7 +5382,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5440,7 +5440,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -5464,7 +5464,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -5486,7 +5486,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -5508,7 +5508,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -5530,7 +5530,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -5590,7 +5590,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5613,7 +5613,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -5634,7 
+5634,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5655,7 +5655,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5676,7 +5676,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5717,7 +5717,7 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB30_2: ; %atomicrmw.start +; GFX12: .LBB30_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -5827,7 +5827,7 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; 
GFX11-NEXT: .LBB30_2: ; %atomicrmw.start +; GFX11: .LBB30_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -5874,7 +5874,7 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB30_2: ; %atomicrmw.start +; GFX10: .LBB30_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -5991,7 +5991,7 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: .LBB30_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB30_4: ; %atomicrmw.start +; GFX908: .LBB30_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -6040,7 +6040,7 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v5, v[4:5] ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB30_2: ; %atomicrmw.start +; GFX8: .LBB30_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -6094,7 +6094,7 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: flat_load_dword v5, v[4:5] ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB30_2: ; %atomicrmw.start +; GFX7: .LBB30_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -6164,7 +6164,7 @@ define double 
@flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB31_4: ; %atomicrmw.start +; GFX12: .LBB31_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -6278,7 +6278,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB31_4: ; %atomicrmw.start +; GFX11: .LBB31_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -6330,7 +6330,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB31_4: ; %atomicrmw.start +; GFX10: .LBB31_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -6443,7 +6443,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB31_4: ; %atomicrmw.start +; GFX908: .LBB31_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -6500,7 +6500,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB31_4: 
; %atomicrmw.start +; GFX8: .LBB31_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -6558,7 +6558,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB31_4: ; %atomicrmw.start +; GFX7: .LBB31_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -6625,7 +6625,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB32_4: ; %atomicrmw.start +; GFX12: .LBB32_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -6740,7 +6740,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB32_4: ; %atomicrmw.start +; GFX11: .LBB32_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -6792,7 +6792,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB32_4: ; %atomicrmw.start +; GFX10: .LBB32_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -6905,7 +6905,7 @@ define double 
@flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB32_4: ; %atomicrmw.start +; GFX908: .LBB32_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -6962,7 +6962,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB32_4: ; %atomicrmw.start +; GFX8: .LBB32_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -7020,7 +7020,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB32_4: ; %atomicrmw.start +; GFX7: .LBB32_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -7083,7 +7083,7 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB33_4: ; %atomicrmw.start +; GFX12: .LBB33_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] @@ -7188,7 +7188,7 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB33_4: ; %atomicrmw.start +; GFX11: .LBB33_4: ; 
%atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7236,7 +7236,7 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB33_4: ; %atomicrmw.start +; GFX10: .LBB33_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7342,7 +7342,7 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB33_4: ; %atomicrmw.start +; GFX908: .LBB33_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7396,7 +7396,7 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB33_4: ; %atomicrmw.start +; GFX8: .LBB33_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7451,7 +7451,7 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB33_4: ; %atomicrmw.start +; GFX7: .LBB33_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7516,7 +7516,7 @@ define void 
@flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB34_4: ; %atomicrmw.start +; GFX12: .LBB34_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] @@ -7626,7 +7626,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB34_4: ; %atomicrmw.start +; GFX11: .LBB34_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7676,7 +7676,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB34_4: ; %atomicrmw.start +; GFX10: .LBB34_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7786,7 +7786,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB34_4: ; %atomicrmw.start +; GFX908: .LBB34_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7842,7 +7842,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: 
.LBB34_4: ; %atomicrmw.start +; GFX8: .LBB34_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7899,7 +7899,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB34_4: ; %atomicrmw.start +; GFX7: .LBB34_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7965,7 +7965,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB35_4: ; %atomicrmw.start +; GFX12: .LBB35_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] @@ -8076,7 +8076,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB35_4: ; %atomicrmw.start +; GFX11: .LBB35_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8126,7 +8126,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB35_4: ; %atomicrmw.start +; GFX10: .LBB35_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8236,7 +8236,7 @@ 
define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB35_4: ; %atomicrmw.start +; GFX908: .LBB35_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8292,7 +8292,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB35_4: ; %atomicrmw.start +; GFX8: .LBB35_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8349,7 +8349,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB35_4: ; %atomicrmw.start +; GFX7: .LBB35_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8408,7 +8408,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB36_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8452,7 +8452,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB36_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8492,7 +8492,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -8526,7 +8526,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB36_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8565,7 +8565,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB36_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8603,7 +8603,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB36_1: ; 
%atomicrmw.start +; GFX10: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8637,7 +8637,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -8669,7 +8669,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -8701,7 +8701,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8735,7 +8735,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -8781,7 +8781,7 @@ define half 
@flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8826,7 +8826,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8868,7 +8868,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -8903,7 +8903,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB37_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 
v6, v5 @@ -8943,7 +8943,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB37_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8982,7 +8982,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9017,7 +9017,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -9050,7 +9050,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -9083,7 +9083,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 
; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9118,7 +9118,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -9165,7 +9165,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9210,7 +9210,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9253,7 +9253,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: 
.LBB38_1: ; %atomicrmw.start +; GFX942: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -9288,7 +9288,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB38_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9328,7 +9328,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB38_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9367,7 +9367,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9402,7 +9402,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A: .LBB38_1: 
; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -9435,7 +9435,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -9468,7 +9468,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9503,7 +9503,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -9549,7 +9549,7 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB39_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9592,7 +9592,7 @@ define void 
@flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB39_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9630,7 +9630,7 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942: .LBB39_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -9663,7 +9663,7 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB39_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9701,7 +9701,7 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB39_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9737,7 +9737,7 @@ define void 
@flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9770,7 +9770,7 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -9801,7 +9801,7 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9832,7 +9832,7 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9865,7 +9865,7 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7: .LBB39_1: ; 
%atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -9909,7 +9909,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9953,7 +9953,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9993,7 +9993,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942: .LBB40_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -10027,7 +10027,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: 
.LBB40_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10066,7 +10066,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB40_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10103,7 +10103,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10137,7 +10137,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -10169,7 +10169,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10201,7 +10201,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10235,7 +10235,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10280,7 +10280,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10324,7 +10324,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10365,7 +10365,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942: .LBB41_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -10399,7 +10399,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10438,7 +10438,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB41_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10475,7 +10475,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10509,7 +10509,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -10541,7 +10541,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10573,7 +10573,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10607,7 +10607,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10643,7 +10643,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: 
flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l @@ -10675,7 +10675,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB42_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 @@ -10704,7 +10704,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 @@ -10727,7 +10727,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB42_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l @@ -10754,7 +10754,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] 
offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB42_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 @@ -10783,7 +10783,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 @@ -10809,7 +10809,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 @@ -10832,7 +10832,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 @@ -10856,7 +10856,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_e32 v3, v4, v2 @@ -10883,7 +10883,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -10917,7 +10917,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB43_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -10950,7 +10950,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB43_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -10981,7 +10981,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942: .LBB43_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -11005,7 +11005,7 @@ define half 
@flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB43_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -11033,7 +11033,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB43_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -11064,7 +11064,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -11090,7 +11090,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11114,7 +11114,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: 
flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11139,7 +11139,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -11166,7 +11166,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -11210,7 +11210,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -11256,7 +11256,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: 
v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB44_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -11299,7 +11299,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -11334,7 +11334,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB44_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -11374,7 +11374,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB44_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -11413,7 +11413,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: 
v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -11448,7 +11448,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -11483,7 +11483,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -11516,7 +11516,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -11551,7 +11551,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -11598,7 +11598,7 @@ define void 
@flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11643,7 +11643,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB45_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11684,7 +11684,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942: .LBB45_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -11718,7 +11718,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11757,7 +11757,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB45_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11794,7 +11794,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11828,7 +11828,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -11862,7 +11862,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11894,7 +11894,7 @@ define void 
@flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11928,7 +11928,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11976,7 +11976,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB46_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12033,7 +12033,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB46_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12084,7 +12084,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 
16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -12225,7 +12225,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -12266,7 +12266,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -12305,7 +12305,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -12343,7 +12343,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 
@@ -12383,7 +12383,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -12429,7 +12429,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB47_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12489,7 +12489,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB47_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12542,7 +12542,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -12687,7 +12687,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr 
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -12729,7 +12729,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -12769,7 +12769,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -12808,7 +12808,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -12849,7 +12849,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -12896,7 +12896,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB48_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12956,7 +12956,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB48_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -13010,7 +13010,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -13155,7 +13155,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -13197,7 +13197,7 @@ 
define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -13237,7 +13237,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -13276,7 +13276,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -13317,7 +13317,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -13364,7 +13364,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; 
GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB49_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -13422,7 +13422,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB49_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13473,7 +13473,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13613,7 +13613,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13654,7 +13654,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 
0x7fff -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13693,7 +13693,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13731,7 +13731,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13771,7 +13771,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13816,7 +13816,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; 
GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB50_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -13874,7 +13874,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB50_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13926,7 +13926,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14066,7 +14066,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14107,7 +14107,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 
v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14146,7 +14146,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14184,7 +14184,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14224,7 +14224,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -14260,7 +14260,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; 
GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB51_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -14307,7 +14307,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB51_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -14349,7 +14349,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14464,7 +14464,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -14498,7 +14498,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14531,7 +14531,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14564,7 +14564,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14598,7 +14598,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -14633,7 +14633,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB52_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l @@ -14678,7 +14678,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-FAKE16-NEXT: 
flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB52_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14718,7 +14718,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14828,7 +14828,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14862,7 +14862,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14894,7 +14894,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908: 
.LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14926,7 +14926,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14960,7 +14960,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15002,7 +15002,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB53_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -15057,7 +15057,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB53_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -15106,7 +15106,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -15242,7 +15242,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -15282,7 +15282,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -15320,7 +15320,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -15357,7 +15357,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -15396,7 +15396,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -15440,7 +15440,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -15501,7 +15501,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -15555,7 +15555,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -15700,7 +15700,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -15742,7 +15742,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -15784,7 +15784,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -15823,7 +15823,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -15864,7 +15864,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -15911,7 +15911,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -15970,7 +15970,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -16022,7 +16022,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB55_1: ; 
%atomicrmw.start +; GFX942: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -16162,7 +16162,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -16203,7 +16203,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -16244,7 +16244,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -16282,7 +16282,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -16322,7 +16322,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -16381,7 +16381,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -16407,7 +16407,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -16431,7 +16431,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -16453,7 +16453,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -16475,7 +16475,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -16507,7 +16507,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -16569,7 +16569,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -16597,7 +16597,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, 
vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -16620,7 +16620,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -16642,7 +16642,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -16666,7 +16666,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -16699,7 +16699,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -16766,6 +16766,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v0, v[3:4] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16793,7 +16794,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -16821,7 +16822,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -16847,7 +16848,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX908: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -16870,7 +16871,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword 
v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -16903,7 +16904,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -16964,7 +16965,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -16988,7 +16989,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17011,7 +17012,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 @@ -17032,7 +17033,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX908: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17053,7 +17054,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -17084,7 +17085,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -17144,7 +17145,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17170,7 +17171,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; 
GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17193,7 +17194,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 @@ -17214,7 +17215,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX908: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17237,7 +17238,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -17270,7 +17271,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -17337,6 +17338,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17363,7 +17365,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17391,7 +17393,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 @@ -17417,7 +17419,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX908: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v0, v1, v2 @@ -17440,7 +17442,7 @@ define void 
@flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -17473,7 +17475,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -17535,7 +17537,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX11: .LBB62_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -17563,7 +17565,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX10: .LBB62_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -17586,7 +17588,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] 
offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX90A: .LBB62_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -17610,7 +17612,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX908: .LBB62_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -17634,7 +17636,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX8: .LBB62_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -17667,7 +17669,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX7: .LBB62_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -17729,7 +17731,7 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX11: .LBB63_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17755,7 +17757,7 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX10: .LBB63_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17778,7 +17780,7 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX90A: .LBB63_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 @@ -17801,7 +17803,7 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX908: .LBB63_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17824,7 +17826,7 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX8: .LBB63_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -17857,7 +17859,7 @@ define void 
@flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX7: .LBB63_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -17918,7 +17920,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX11: .LBB64_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -17944,7 +17946,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX10: .LBB64_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -17968,7 +17970,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX90A: .LBB64_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -17990,7 +17992,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB64_1: ; 
%atomicrmw.start +; GFX908: .LBB64_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -18012,7 +18014,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX8: .LBB64_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -18044,7 +18046,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX7: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -18106,7 +18108,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX11: .LBB65_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18130,7 +18132,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX10: .LBB65_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18153,7 +18155,7 @@ define void 
@flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX90A: .LBB65_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 @@ -18174,7 +18176,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX908: .LBB65_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18195,7 +18197,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX8: .LBB65_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -18226,7 +18228,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX7: .LBB65_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -18286,7 +18288,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: 
flat_load_b32 v3, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX11: .LBB66_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -18312,7 +18314,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX10: .LBB66_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -18336,7 +18338,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX90A: .LBB66_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -18358,7 +18360,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX908: .LBB66_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -18380,7 +18382,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX8: .LBB66_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -18412,7 +18414,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX7: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -18474,7 +18476,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX11: .LBB67_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18498,7 +18500,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX10: .LBB67_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18521,7 +18523,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX90A: .LBB67_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 @@ -18542,7 +18544,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX908: .LBB67_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18563,7 +18565,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX8: .LBB67_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -18594,7 +18596,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX7: .LBB67_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -18759,7 +18761,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX10: .LBB68_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -18801,7 +18803,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX90A: .LBB68_1: ; %atomicrmw.start ; GFX90A-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -18841,7 +18843,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX908: .LBB68_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -18879,7 +18881,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX8: .LBB68_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -18925,7 +18927,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX7: .LBB68_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -19087,7 +19089,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX10: .LBB69_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -19128,7 +19130,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: 
s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX90A: .LBB69_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -19168,7 +19170,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX908: .LBB69_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -19208,7 +19210,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX8: .LBB69_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -19255,7 +19257,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX7: .LBB69_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -19422,7 +19424,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX10: .LBB70_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -19468,7 +19470,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX90A: .LBB70_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -19512,7 +19514,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX908: .LBB70_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -19551,7 +19553,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX8: .LBB70_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -19598,7 +19600,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX7: .LBB70_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -19753,7 +19755,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 
16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX10: .LBB71_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19794,7 +19796,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX90A: .LBB71_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19833,7 +19835,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX908: .LBB71_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19870,7 +19872,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX8: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19915,7 +19917,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX7: .LBB71_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -20071,7 +20073,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX10: .LBB72_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20112,7 +20114,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX90A: .LBB72_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20151,7 +20153,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX908: .LBB72_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20190,7 +20192,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX8: .LBB72_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20237,7 +20239,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX7: .LBB72_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -20403,7 +20405,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX10: .LBB73_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20449,7 +20451,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX90A: .LBB73_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -20493,7 +20495,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX908: .LBB73_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -20532,7 +20534,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX8: .LBB73_1: ; %atomicrmw.start ; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20579,7 +20581,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX7: .LBB73_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -20741,7 +20743,7 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX10: .LBB74_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -20782,7 +20784,7 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX90A: .LBB74_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -20824,7 +20826,7 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX908: .LBB74_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -20864,7 +20866,7 @@ define <2 x bfloat> 
@flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX8: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -20911,7 +20913,7 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX7: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -21069,7 +21071,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX10: .LBB75_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21110,7 +21112,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX90A: .LBB75_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21151,7 +21153,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB75_1: ; 
%atomicrmw.start +; GFX908: .LBB75_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21190,7 +21192,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX8: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21237,7 +21239,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX7: .LBB75_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -21396,7 +21398,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX10: .LBB76_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -21438,7 +21440,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX90A: .LBB76_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -21478,7 +21480,7 @@ define <2 
x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX908: .LBB76_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -21516,7 +21518,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX8: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -21562,7 +21564,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX7: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -21718,7 +21720,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX10: .LBB77_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21759,7 +21761,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB77_1: ; 
%atomicrmw.start +; GFX90A: .LBB77_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21798,7 +21800,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX908: .LBB77_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21835,7 +21837,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX8: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21880,7 +21882,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX7: .LBB77_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -22038,7 +22040,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX10: .LBB78_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -22080,7 +22082,7 @@ 
define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX90A: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -22120,7 +22122,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX908: .LBB78_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -22158,7 +22160,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX8: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -22204,7 +22206,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX7: .LBB78_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -22360,7 +22362,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB79_1: ; 
%atomicrmw.start +; GFX10: .LBB79_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22401,7 +22403,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX90A: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22440,7 +22442,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX908: .LBB79_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22477,7 +22479,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX8: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22522,7 +22524,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX7: .LBB79_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 diff --git 
a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 6218a5c82afcd..106dc2b69dfd4 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -35,7 +35,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -80,7 +80,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -104,7 +104,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX908: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -128,7 +128,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -178,7 +178,7 @@ define float 
@flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -225,7 +225,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX90A: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -249,7 +249,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -275,7 +275,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -335,7 +335,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; 
GFX942: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v0 @@ -389,7 +389,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX90A: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -417,7 +417,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX908: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -442,7 +442,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -494,7 +494,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -540,7 +540,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; 
GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -563,7 +563,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX908: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -586,7 +586,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -635,7 +635,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -683,7 +683,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -706,7 +706,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -731,7 +731,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -789,7 +789,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -845,7 +845,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 @@ -873,7 +873,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; 
GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 @@ -898,7 +898,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -951,7 +951,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -998,7 +998,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1024,7 +1024,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1050,7 +1050,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -1103,7 +1103,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1151,7 +1151,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1176,7 +1176,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1201,7 +1201,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 
s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -1253,7 +1253,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1278,7 +1278,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1306,7 +1306,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1332,7 +1332,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1356,7 +1356,7 @@ define 
float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1380,7 +1380,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1404,7 +1404,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -1446,7 +1446,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1491,7 +1491,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1515,7 +1515,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1539,7 +1539,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1593,7 +1593,7 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1638,7 +1638,7 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1662,7 +1662,7 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; 
GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1686,7 +1686,7 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1736,7 +1736,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1783,7 +1783,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1807,7 +1807,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1833,7 +1833,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -1893,7 +1893,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v0 @@ -1947,7 +1947,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -1975,7 +1975,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -2000,7 +2000,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -2052,7 +2052,7 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2098,7 +2098,7 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2121,7 +2121,7 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2144,7 +2144,7 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ 
-2193,7 +2193,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2241,7 +2241,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2264,7 +2264,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2289,7 +2289,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2347,7 +2347,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: 
v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2403,7 +2403,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2431,7 +2431,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2456,7 +2456,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2509,7 +2509,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 
v5, v3 @@ -2556,7 +2556,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2582,7 +2582,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2608,7 +2608,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -2661,7 +2661,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2709,7 +2709,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: 
v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2734,7 +2734,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2759,7 +2759,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2813,7 +2813,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX12: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -2908,7 +2908,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX11: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: 
v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -3043,7 +3043,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX908: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v3 @@ -3099,7 +3099,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v3, v[2:3] ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX8: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -3215,7 +3215,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX12: .LBB19_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -3314,7 +3314,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX11: .LBB19_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -3454,7 +3454,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX908-NEXT: 
flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX908: .LBB19_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -3514,7 +3514,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX8: .LBB19_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3627,7 +3627,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX12: .LBB20_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -3727,7 +3727,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX11: .LBB20_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -3867,7 +3867,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX908: .LBB20_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -3927,7 +3927,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX8: .LBB20_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -4036,7 +4036,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX12: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -4128,7 +4128,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX11: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4260,7 +4260,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX908: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4317,7 +4317,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v5, v[2:3] ; 
GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX8: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4425,7 +4425,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start +; GFX12: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] @@ -4522,7 +4522,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start +; GFX11: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4660,7 +4660,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start +; GFX908: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4719,7 +4719,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start +; GFX8: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4830,7 +4830,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start +; GFX12: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] @@ -4928,7 +4928,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start +; GFX11: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5066,7 +5066,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start +; GFX908: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5125,7 +5125,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start +; GFX8: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5225,7 +5225,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.1: ; 
%atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX12: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -5320,7 +5320,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX11: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -5371,7 +5371,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX10: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v3 @@ -5433,7 +5433,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start +; GFX90A: .LBB24_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] @@ -5479,7 +5479,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX908: .LBB24_2: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v3 @@ -5535,7 +5535,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: flat_load_dword v3, v[2:3] ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX8: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -5592,7 +5592,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: flat_load_dword v3, v[2:3] ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX7: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -5653,7 +5653,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX12: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -5748,7 +5748,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX11: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -5883,7 +5883,7 @@ define double 
@flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX908: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v3 @@ -5939,7 +5939,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: flat_load_dword v3, v[2:3] ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX8: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -6047,7 +6047,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6094,7 +6094,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6136,7 +6136,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; 
GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -6172,7 +6172,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6214,7 +6214,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6254,7 +6254,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6290,7 +6290,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A: .LBB26_1: ; %atomicrmw.start ; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -6324,7 +6324,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -6358,7 +6358,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6393,7 +6393,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -6442,7 +6442,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6491,7 +6491,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain 
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6535,7 +6535,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -6574,7 +6574,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6617,7 +6617,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6658,7 +6658,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, 
v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6695,7 +6695,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -6730,7 +6730,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -6765,7 +6765,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6801,7 +6801,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 
@@ -6851,7 +6851,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6900,7 +6900,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6945,7 +6945,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX942: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -6984,7 +6984,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -7027,7 +7027,7 @@ define half 
@flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -7068,7 +7068,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7105,7 +7105,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7140,7 +7140,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7175,7 +7175,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: 
v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -7211,7 +7211,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -7258,7 +7258,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7303,7 +7303,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7344,7 +7344,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7379,7 +7379,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7419,7 +7419,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7458,7 +7458,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7493,7 +7493,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, 
v3, v5 @@ -7526,7 +7526,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7559,7 +7559,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7593,7 +7593,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -7640,7 +7640,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -7687,7 +7687,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7730,7 +7730,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX942: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7768,7 +7768,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -7809,7 +7809,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7849,7 +7849,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; 
GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7885,7 +7885,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7919,7 +7919,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7953,7 +7953,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7988,7 +7988,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8036,7 
+8036,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -8083,7 +8083,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8127,7 +8127,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX942: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8165,7 +8165,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -8206,7 +8206,7 @@ define void 
@flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8246,7 +8246,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8282,7 +8282,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8316,7 +8316,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8350,7 +8350,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 
s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8385,7 +8385,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8422,7 +8422,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8458,7 +8458,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8491,7 +8491,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX942: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -8517,7 +8517,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8548,7 +8548,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8581,7 +8581,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -8609,7 +8609,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -8635,7 +8635,7 @@ define half 
@flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -8662,7 +8662,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -8690,7 +8690,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -8726,7 +8726,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l @@ -8760,7 +8760,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 
-; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -8792,7 +8792,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX942: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8817,7 +8817,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l @@ -8846,7 +8846,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8878,7 +8878,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10: .LBB33_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8906,7 +8906,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8931,7 +8931,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8957,7 +8957,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8985,7 +8985,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -9031,7 +9031,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 
v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9081,7 +9081,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9126,7 +9126,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX942: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -9165,7 +9165,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9208,7 +9208,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB34_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9249,7 +9249,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9286,7 +9286,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -9323,7 +9323,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -9358,7 +9358,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9394,7 +9394,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -9444,7 +9444,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -9492,7 +9492,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9536,7 +9536,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX942: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9574,7 +9574,7 @@ define void 
@flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -9615,7 +9615,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9655,7 +9655,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9691,7 +9691,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9727,7 +9727,7 @@ define void 
@flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9761,7 +9761,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9796,7 +9796,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9844,7 +9844,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB36_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9901,7 +9901,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 
v4, v4 -; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB36_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9952,7 +9952,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -10093,7 +10093,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10134,7 +10134,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10173,7 +10173,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, 
v5 @@ -10211,7 +10211,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10251,7 +10251,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -10298,7 +10298,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10358,7 +10358,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10411,7 +10411,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: 
s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -10556,7 +10556,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10598,7 +10598,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10638,7 +10638,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10677,7 +10677,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10718,7 +10718,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -10766,7 +10766,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10826,7 +10826,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10880,7 +10880,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX942: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -11025,7 +11025,7 @@ define bfloat 
@flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -11067,7 +11067,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -11107,7 +11107,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -11146,7 +11146,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -11187,7 +11187,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7: .LBB38_1: ; %atomicrmw.start 
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -11234,7 +11234,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB39_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11289,7 +11289,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB39_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11338,7 +11338,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942: .LBB39_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11474,7 +11474,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11514,7 +11514,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11552,7 +11552,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11589,7 +11589,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11628,7 +11628,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7: .LBB39_1: ; %atomicrmw.start ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11673,7 +11673,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11731,7 +11731,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11782,7 +11782,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942: .LBB40_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11922,7 +11922,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10: .LBB40_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11963,7 +11963,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12002,7 +12002,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12040,7 +12040,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12080,7 +12080,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; 
%atomicrmw.start +; GFX7: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12126,7 +12126,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -12184,7 +12184,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12236,7 +12236,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942: .LBB41_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12376,7 +12376,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: 
.LBB41_1: ; %atomicrmw.start +; GFX10: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12417,7 +12417,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12456,7 +12456,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12494,7 +12494,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12534,7 +12534,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 
v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12571,7 +12571,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -12618,7 +12618,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB42_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -12660,7 +12660,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -12775,7 +12775,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -12809,7 +12809,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -12842,7 +12842,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -12875,7 +12875,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -12909,7 +12909,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -12945,7 +12945,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: 
flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB43_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l @@ -12990,7 +12990,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB43_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13030,7 +13030,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942: .LBB43_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13140,7 +13140,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13174,7 +13174,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A: .LBB43_1: 
; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13206,7 +13206,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13238,7 +13238,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13272,7 +13272,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13316,7 +13316,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: 
v_mov_b32_e32 v6, v5 @@ -13377,7 +13377,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB44_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -13431,7 +13431,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -13576,7 +13576,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -13618,7 +13618,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -13660,7 +13660,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -13699,7 +13699,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -13740,7 +13740,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -13788,7 +13788,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -13847,7 +13847,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16: 
.LBB45_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13899,7 +13899,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942: .LBB45_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14039,7 +14039,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14080,7 +14080,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14121,7 +14121,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; 
GFX908: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14159,7 +14159,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14199,7 +14199,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -14241,7 +14241,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14270,7 +14270,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14296,7 +14296,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14324,7 +14324,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -14350,7 +14350,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14374,7 +14374,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14399,7 +14399,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -14433,7 +14433,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -14477,7 +14477,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14506,7 +14506,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14532,7 +14532,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 
v4, v3 @@ -14562,7 +14562,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -14587,7 +14587,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14611,7 +14611,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14638,7 +14638,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14673,7 +14673,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 
v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14716,7 +14716,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14753,7 +14753,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v0 @@ -14781,6 +14781,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v0, v[3:4] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14810,7 +14811,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -14840,7 +14841,7 @@ define <2 x half> 
@flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -14868,7 +14869,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -14894,7 +14895,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14929,7 +14930,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14972,7 +14973,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; 
%atomicrmw.start +; GFX12: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15000,7 +15001,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15025,7 +15026,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15052,7 +15053,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15077,7 +15078,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15100,7 +15101,7 @@ define void 
@flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15124,7 +15125,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15157,7 +15158,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15199,7 +15200,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15227,7 +15228,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; 
GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15252,7 +15253,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15281,7 +15282,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15306,7 +15307,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15329,7 +15330,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15355,7 +15356,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15390,7 +15391,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15433,7 +15434,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15467,7 +15468,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15495,6 +15496,7 @@ define void 
@flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15524,7 +15526,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15554,7 +15556,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 @@ -15582,7 +15584,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 @@ -15608,7 +15610,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8: .LBB51_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15643,7 +15645,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15686,7 +15688,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -15716,7 +15718,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -15742,7 +15744,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -15772,7 +15774,7 @@ define <2 x 
half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -15797,7 +15799,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -15823,7 +15825,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -15850,7 +15852,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -15885,7 +15887,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, 
v1 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -15928,7 +15930,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15957,7 +15959,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15982,7 +15984,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -16011,7 +16013,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -16036,7 +16038,7 @@ 
define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -16061,7 +16063,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -16087,7 +16089,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -16122,7 +16124,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -16170,7 +16172,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16224,7 +16226,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16274,7 +16276,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -16413,7 +16415,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16455,7 +16457,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; 
%atomicrmw.start +; GFX90A: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16495,7 +16497,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16533,7 +16535,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -16579,7 +16581,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16621,7 +16623,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, 
v3 @@ -16675,7 +16677,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16725,7 +16727,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -16866,7 +16868,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -16907,7 +16909,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16947,7 +16949,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16987,7 +16989,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -17034,7 +17036,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -17075,7 +17077,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -17129,7 +17131,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -17187,7 +17189,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v0 @@ -17330,7 +17332,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -17376,7 +17378,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -17420,7 +17422,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -17459,7 +17461,7 @@ define <2 x bfloat> 
@flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -17506,7 +17508,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -17547,7 +17549,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17599,7 +17601,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17648,7 +17650,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 
0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17782,7 +17784,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17823,7 +17825,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17862,7 +17864,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17899,7 +17901,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17944,7 +17946,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17984,7 +17986,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18036,7 +18038,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB58_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18085,7 +18087,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX942: .LBB58_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18221,7 +18223,7 @@ 
define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18262,7 +18264,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18301,7 +18303,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX908: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18340,7 +18342,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18387,7 +18389,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 
16, v4 -; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18428,7 +18430,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18480,7 +18482,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB59_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18535,7 +18537,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX942: .LBB59_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18677,7 +18679,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18723,7 +18725,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -18767,7 +18769,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX908: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -18806,7 +18808,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18853,7 +18855,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18894,7 +18896,7 @@ define <2 x bfloat> 
@flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB60_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -18949,7 +18951,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB60_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -19000,7 +19002,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX942: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -19141,7 +19143,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -19182,7 +19184,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, 
v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -19224,7 +19226,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX908: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -19264,7 +19266,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -19311,7 +19313,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -19352,7 +19354,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB61_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19405,7 +19407,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB61_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19455,7 +19457,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX942: .LBB61_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19591,7 +19593,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19632,7 +19634,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19673,7 +19675,7 @@ define void 
@flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX908: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19712,7 +19714,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19759,7 +19761,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 6eafbb50e4bb9..356e14ea4fc18 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -35,7 +35,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ 
-80,7 +80,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -104,7 +104,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX908: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -128,7 +128,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -178,7 +178,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -225,7 +225,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start +; 
GFX90A: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -249,7 +249,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -275,7 +275,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -335,7 +335,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX942: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v0 @@ -389,7 +389,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX90A: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -417,7 +417,7 @@ define float 
@flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX908: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -442,7 +442,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -494,7 +494,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -540,7 +540,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -563,7 +563,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX908: .LBB3_1: ; %atomicrmw.start ; 
GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -586,7 +586,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -635,7 +635,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -683,7 +683,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -706,7 +706,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -731,7 +731,7 @@ define void 
@flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -789,7 +789,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -845,7 +845,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 @@ -873,7 +873,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 @@ -898,7 +898,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8: .LBB5_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -951,7 +951,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -998,7 +998,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1024,7 +1024,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1050,7 +1050,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -1103,7 +1103,7 @@ define void 
@flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1151,7 +1151,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1176,7 +1176,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1201,7 +1201,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -1253,7 +1253,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB8_1: ; 
%atomicrmw.start +; GFX942: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1278,7 +1278,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1306,7 +1306,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1332,7 +1332,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1356,7 +1356,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1380,7 +1380,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, 
; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1404,7 +1404,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -1446,7 +1446,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1491,7 +1491,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1515,7 +1515,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1539,7 +1539,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1593,7 +1593,7 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1638,7 +1638,7 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1662,7 +1662,7 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1686,7 +1686,7 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; 
GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1736,7 +1736,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1783,7 +1783,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1807,7 +1807,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1833,7 +1833,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -1893,7 +1893,7 @@ 
define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v0 @@ -1947,7 +1947,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -1975,7 +1975,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -2000,7 +2000,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -2052,7 +2052,7 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; 
GFX942: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2098,7 +2098,7 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2121,7 +2121,7 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2144,7 +2144,7 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2193,7 +2193,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2241,7 +2241,7 @@ define void 
@flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2264,7 +2264,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2289,7 +2289,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2347,7 +2347,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2403,7 +2403,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB15_1: 
; %atomicrmw.start +; GFX90A: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2431,7 +2431,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2456,7 +2456,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2509,7 +2509,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -2556,7 +2556,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2582,7 +2582,7 @@ define float 
@flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2608,7 +2608,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -2661,7 +2661,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2709,7 +2709,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2734,7 +2734,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB17_1: 
; %atomicrmw.start +; GFX908: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2759,7 +2759,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2813,7 +2813,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX12: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -2908,7 +2908,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX11: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -3043,7 +3043,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX908: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v3 @@ -3099,7 +3099,7 @@ 
define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v3, v[2:3] ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX8: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -3215,7 +3215,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX12: .LBB19_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -3314,7 +3314,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX11: .LBB19_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -3454,7 +3454,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX908: .LBB19_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -3514,7 +3514,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB19_4: 
; %atomicrmw.start +; GFX8: .LBB19_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3627,7 +3627,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX12: .LBB20_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -3727,7 +3727,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX11: .LBB20_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -3867,7 +3867,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX908: .LBB20_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -3927,7 +3927,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX8: .LBB20_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -4036,7 +4036,7 @@ define void 
@flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX12: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -4128,7 +4128,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX11: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4260,7 +4260,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX908: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4317,7 +4317,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: flat_load_dword v5, v[2:3] ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX8: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4425,7 +4425,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB22_4: ; 
%atomicrmw.start +; GFX12: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] @@ -4522,7 +4522,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start +; GFX11: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4660,7 +4660,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start +; GFX908: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4719,7 +4719,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start +; GFX8: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4830,7 +4830,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start +; GFX12: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] @@ -4928,7 
+4928,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start +; GFX11: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5066,7 +5066,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start +; GFX908: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5125,7 +5125,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start +; GFX8: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5225,7 +5225,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX12: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -5320,7 +5320,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 
-; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX11: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -5371,7 +5371,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX10: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v3 @@ -5433,7 +5433,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start +; GFX90A: .LBB24_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] @@ -5479,7 +5479,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX908: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v3 @@ -5535,7 +5535,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: flat_load_dword v3, v[2:3] ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX8: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
v_mov_b32_e32 v9, v3 @@ -5592,7 +5592,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: flat_load_dword v3, v[2:3] ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX7: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -5653,7 +5653,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX12: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -5748,7 +5748,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX11: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 @@ -5883,7 +5883,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX908: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v3 @@ -5939,7 +5939,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: flat_load_dword v3, v[2:3] ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: 
s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX8: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -6047,7 +6047,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6094,7 +6094,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6136,7 +6136,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -6172,7 +6172,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB26_1: ; %atomicrmw.start ; 
GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6214,7 +6214,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6254,7 +6254,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6290,7 +6290,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -6324,7 +6324,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -6358,7 +6358,7 @@ define half 
@flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6393,7 +6393,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -6442,7 +6442,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6491,7 +6491,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6535,7 +6535,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 
v2, v2, v2 -; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -6574,7 +6574,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6617,7 +6617,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6658,7 +6658,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6695,7 +6695,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A: .LBB27_1: ; 
%atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -6730,7 +6730,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -6765,7 +6765,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6801,7 +6801,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -6851,7 +6851,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6900,7 +6900,7 @@ define half 
@flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6945,7 +6945,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX942: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -6984,7 +6984,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -7027,7 +7027,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -7068,7 +7068,7 @@ define half 
@flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7105,7 +7105,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7140,7 +7140,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7175,7 +7175,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -7211,7 +7211,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -7258,7 +7258,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7303,7 +7303,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7344,7 +7344,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7379,7 +7379,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7419,7 +7419,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7458,7 +7458,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7493,7 +7493,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7526,7 +7526,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7559,7 +7559,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; 
GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7593,7 +7593,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -7640,7 +7640,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -7687,7 +7687,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7730,7 +7730,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB30_1: ; 
%atomicrmw.start +; GFX942: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7768,7 +7768,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -7809,7 +7809,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7849,7 +7849,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7885,7 +7885,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A: .LBB30_1: ; %atomicrmw.start ; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7919,7 +7919,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7953,7 +7953,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7988,7 +7988,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8036,7 +8036,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -8083,7 +8083,7 @@ define void 
@flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8127,7 +8127,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX942: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8165,7 +8165,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -8206,7 +8206,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8246,7 +8246,7 @@ define void 
@flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8282,7 +8282,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8316,7 +8316,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8350,7 +8350,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8385,7 +8385,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7: .LBB31_1: ; 
%atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8422,7 +8422,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8458,7 +8458,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8491,7 +8491,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX942: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -8517,7 +8517,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8548,7 +8548,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8581,7 +8581,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -8609,7 +8609,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -8635,7 +8635,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -8662,7 +8662,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: flat_load_dword v0, 
v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -8690,7 +8690,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -8726,7 +8726,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l @@ -8760,7 +8760,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -8792,7 +8792,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX942: .LBB33_1: ; 
%atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8817,7 +8817,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l @@ -8846,7 +8846,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8878,7 +8878,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8906,7 +8906,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
v_max_f16_e32 v2, v3, v3 @@ -8931,7 +8931,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8957,7 +8957,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -8985,7 +8985,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -9031,7 +9031,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9081,7 +9081,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9126,7 +9126,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX942: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -9165,7 +9165,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9208,7 +9208,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB34_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9249,7 +9249,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; 
GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9286,7 +9286,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -9323,7 +9323,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -9358,7 +9358,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9394,7 +9394,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -9444,7 +9444,7 @@ define void 
@flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -9492,7 +9492,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9536,7 +9536,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX942: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9574,7 +9574,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -9615,7 +9615,7 @@ define void 
@flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9655,7 +9655,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9691,7 +9691,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9727,7 +9727,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9761,7 +9761,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 
s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9796,7 +9796,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9844,7 +9844,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB36_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9901,7 +9901,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB36_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9952,7 +9952,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942: .LBB36_1: ; %atomicrmw.start ; 
GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -10093,7 +10093,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10134,7 +10134,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10173,7 +10173,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10211,7 +10211,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10251,7 +10251,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_not_b32_e32 
v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -10298,7 +10298,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10358,7 +10358,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10411,7 +10411,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -10556,7 +10556,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; 
GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10598,7 +10598,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10638,7 +10638,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10677,7 +10677,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10718,7 +10718,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -10766,7 +10766,7 @@ define bfloat 
@flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10826,7 +10826,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10880,7 +10880,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX942: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -11025,7 +11025,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -11067,7 +11067,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -11107,7 +11107,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -11146,7 +11146,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -11187,7 +11187,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -11234,7 +11234,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB39_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11289,7 +11289,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB39_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11338,7 +11338,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942: .LBB39_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11474,7 +11474,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11514,7 +11514,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11552,7 +11552,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11589,7 +11589,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11628,7 +11628,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11673,7 +11673,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB40_1: ; %atomicrmw.start 
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11731,7 +11731,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11782,7 +11782,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942: .LBB40_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11922,7 +11922,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11963,7 +11963,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A: .LBB40_1: ; %atomicrmw.start ; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12002,7 +12002,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12040,7 +12040,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12080,7 +12080,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12126,7 +12126,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; 
GFX12-TRUE16: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -12184,7 +12184,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12236,7 +12236,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942: .LBB41_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12376,7 +12376,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12417,7 +12417,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; 
GFX90A: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12456,7 +12456,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12494,7 +12494,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12534,7 +12534,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12571,7 +12571,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-TRUE16: 
.LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -12618,7 +12618,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB42_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -12660,7 +12660,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -12775,7 +12775,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -12809,7 +12809,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -12842,7 
+12842,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -12875,7 +12875,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -12909,7 +12909,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -12945,7 +12945,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB43_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l @@ -12990,7 +12990,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB43_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13030,7 +13030,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942: .LBB43_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13140,7 +13140,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13174,7 +13174,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13206,7 +13206,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13238,7 +13238,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13272,7 +13272,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13316,7 +13316,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -13377,7 +13377,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB44_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ 
-13431,7 +13431,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -13576,7 +13576,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -13618,7 +13618,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -13660,7 +13660,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -13699,7 +13699,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB44_1: ; 
%atomicrmw.start +; GFX8: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -13740,7 +13740,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -13788,7 +13788,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -13847,7 +13847,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB45_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13899,7 +13899,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942: .LBB45_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14039,7 +14039,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14080,7 +14080,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14121,7 +14121,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14159,7 +14159,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8: .LBB45_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14199,7 +14199,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -14241,7 +14241,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14270,7 +14270,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14296,7 +14296,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14324,7 +14324,7 @@ define <2 x half> 
@flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -14350,7 +14350,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14374,7 +14374,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14399,7 +14399,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -14433,7 +14433,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB46_1: ; 
%atomicrmw.start +; GFX7: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -14477,7 +14477,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14506,7 +14506,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14532,7 +14532,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14562,7 +14562,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -14587,7 +14587,7 @@ define <2 x half> 
@flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14611,7 +14611,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14638,7 +14638,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14673,7 +14673,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14716,7 +14716,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 
s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14753,7 +14753,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v0 @@ -14781,6 +14781,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v0, v[3:4] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14810,7 +14811,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -14840,7 +14841,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -14868,7 +14869,7 @@ define <2 x half> 
@flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -14894,7 +14895,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14929,7 +14930,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14972,7 +14973,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15000,7 +15001,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB49_1: ; 
%atomicrmw.start +; GFX942: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15025,7 +15026,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15052,7 +15053,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15077,7 +15078,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15100,7 +15101,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15124,7 +15125,7 @@ define void 
@flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15157,7 +15158,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15199,7 +15200,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15227,7 +15228,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15252,7 +15253,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: flat_load_b32 v3, v[0:1] 
offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15281,7 +15282,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15306,7 +15307,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15329,7 +15330,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15355,7 +15356,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8: .LBB50_1: ; %atomicrmw.start ; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15390,7 +15391,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15433,7 +15434,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15467,7 +15468,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15495,6 +15496,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15524,7 +15526,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; 
GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15554,7 +15556,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 @@ -15582,7 +15584,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 @@ -15608,7 +15610,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15643,7 +15645,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: 
.LBB51_1: ; %atomicrmw.start +; GFX7: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15686,7 +15688,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -15716,7 +15718,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -15742,7 +15744,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -15772,7 +15774,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -15797,7 +15799,7 @@ define <2 x half> 
@flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -15823,7 +15825,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -15850,7 +15852,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -15885,7 +15887,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -15928,7 +15930,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 
s0, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15957,7 +15959,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15982,7 +15984,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -16011,7 +16013,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -16036,7 +16038,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ 
-16061,7 +16063,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -16087,7 +16089,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -16122,7 +16124,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -16170,7 +16172,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16224,7 +16226,7 @@ define <2 x bfloat> 
@flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16274,7 +16276,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -16413,7 +16415,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16455,7 +16457,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16495,7 +16497,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 
0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16533,7 +16535,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -16579,7 +16581,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16621,7 +16623,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16675,7 +16677,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16725,7 +16727,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -16866,7 +16868,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -16907,7 +16909,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16947,7 +16949,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16987,7 +16989,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; 
GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -17034,7 +17036,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -17075,7 +17077,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -17129,7 +17131,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -17187,7 +17189,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942: 
.LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v0 @@ -17330,7 +17332,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -17376,7 +17378,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -17420,7 +17422,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -17459,7 +17461,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -17506,7 +17508,7 @@ define <2 x bfloat> 
@flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -17547,7 +17549,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17599,7 +17601,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17648,7 +17650,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17782,7 +17784,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17823,7 +17825,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17862,7 +17864,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17899,7 +17901,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17944,7 +17946,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17984,7 +17986,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18036,7 +18038,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB58_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18085,7 +18087,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX942: .LBB58_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18221,7 +18223,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18262,7 
+18264,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18301,7 +18303,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX908: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18340,7 +18342,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18387,7 +18389,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18428,7 +18430,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; 
GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18480,7 +18482,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB59_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18535,7 +18537,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX942: .LBB59_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18677,7 +18679,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18723,7 +18725,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A: .LBB59_1: ; %atomicrmw.start ; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -18767,7 +18769,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX908: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -18806,7 +18808,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18853,7 +18855,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18894,7 +18896,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB60_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -18949,7 +18951,7 @@ define <2 
x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB60_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -19000,7 +19002,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX942: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -19141,7 +19143,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -19182,7 +19184,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -19224,7 +19226,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 
0x7060302 -; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX908: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -19264,7 +19266,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -19311,7 +19313,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -19352,7 +19354,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB61_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19405,7 +19407,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB61_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19455,7 +19457,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX942: .LBB61_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19591,7 +19593,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19632,7 +19634,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19673,7 +19675,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX908: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19712,7 +19714,7 @@ define void 
@flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19759,7 +19761,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 25f29c8c87c96..c122871b78908 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -24,7 +24,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX12: .LBB0_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -51,7 +51,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -74,7 +74,7 @@ define float 
@flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -100,7 +100,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -124,7 +124,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -146,7 +146,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX908: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -168,7 +168,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8: .LBB0_1: ; %atomicrmw.start ; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -190,7 +190,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -220,7 +220,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX12: .LBB1_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -247,7 +247,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -270,7 +270,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX11: .LBB1_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -298,7 +298,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX10-NEXT: 
v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX10: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -321,7 +321,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX90A: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -343,7 +343,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -367,7 +367,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -390,7 +390,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -420,7 +420,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX12: .LBB2_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -455,7 +455,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX942: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, v0 @@ -480,6 +480,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v0, v[3:4] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -507,7 +508,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX10: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -535,7 +536,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 
s[4:5], 0 -; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX90A: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -561,7 +562,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX908: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -584,7 +585,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -607,7 +608,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -637,7 +638,7 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX12: .LBB3_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -662,7 +663,7 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) 
#0 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -684,7 +685,7 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX11: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -708,7 +709,7 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX10: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -731,7 +732,7 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -752,7 +753,7 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX908: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -773,7 +774,7 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -794,7 +795,7 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -823,7 +824,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX12: .LBB4_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -848,7 +849,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -870,7 +871,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX11: .LBB4_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -896,7 +897,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX10: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -919,7 +920,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -940,7 +941,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -963,7 +964,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -986,7 +987,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1016,7 +1017,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1047,7 +1048,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -1072,6 +1073,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1098,7 +1100,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 
0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX10: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1126,7 +1128,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1152,7 +1154,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1175,7 +1177,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1198,7 +1200,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ 
-1228,7 +1230,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -1256,7 +1258,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1279,7 +1281,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1307,7 +1309,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -1330,7 +1332,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1354,7 +1356,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1378,7 +1380,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -1401,7 +1403,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -1431,7 +1433,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1457,7 +1459,7 
@@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -1479,7 +1481,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1505,7 +1507,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1528,7 +1530,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -1551,7 +1553,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: 
flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1574,7 +1576,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1597,7 +1599,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1631,7 +1633,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -1658,7 +1660,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: 
v_mov_b32_e32 v5, v3 @@ -1681,7 +1683,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1707,7 +1709,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1731,7 +1733,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1753,7 +1755,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1775,7 +1777,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 
s[4:5], 0 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1797,7 +1799,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -1827,7 +1829,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -1854,7 +1856,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1877,7 +1879,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1905,7 +1907,7 @@ 
define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -1928,7 +1930,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1950,7 +1952,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1974,7 +1976,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -1997,7 +1999,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 
-; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -2027,7 +2029,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -2062,7 +2064,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, v0 @@ -2087,6 +2089,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v0, v[3:4] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2114,7 +2117,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -2142,7 +2145,7 @@ define float 
@flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -2168,7 +2171,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -2191,7 +2194,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -2214,7 +2217,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -2244,7 +2247,7 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2269,7 +2272,7 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -2291,7 +2294,7 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2315,7 +2318,7 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2338,7 +2341,7 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -2359,7 +2362,7 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; 
GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2380,7 +2383,7 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2401,7 +2404,7 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2430,7 +2433,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2455,7 +2458,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -2477,7 +2480,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2503,7 +2506,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2526,7 +2529,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -2547,7 +2550,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2570,7 +2573,7 @@ define void 
@flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2593,7 +2596,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2623,7 +2626,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2654,7 +2657,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -2679,6 +2682,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB13_1: ; 
%atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2705,7 +2709,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2733,7 +2737,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2759,7 +2763,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2782,7 +2786,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2805,7 +2809,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX7-NEXT: 
v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2835,7 +2839,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -2863,7 +2867,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -2886,7 +2890,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -2914,7 +2918,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -2937,7 +2941,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2961,7 +2965,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2985,7 +2989,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -3008,7 +3012,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 @@ -3038,7 +3042,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo 
; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3064,7 +3068,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -3086,7 +3090,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3112,7 +3116,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3135,7 +3139,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A: 
.LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -3158,7 +3162,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3181,7 +3185,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3204,7 +3208,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3246,7 +3250,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX12: .LBB16_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3297,7 +3301,7 @@ define double 
@flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX942-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX942-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX942: .LBB16_2: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] @@ -3344,7 +3348,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX11: .LBB16_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3391,7 +3395,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX10: .LBB16_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -3450,7 +3454,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX90A-NEXT: .LBB16_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.start +; GFX90A: .LBB16_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] @@ -3501,7 +3505,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX908-NEXT: .LBB16_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: 
.LBB16_4: ; %atomicrmw.start +; GFX908: .LBB16_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -3550,7 +3554,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX8-NEXT: flat_load_dword v5, v[4:5] ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX8: .LBB16_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -3604,7 +3608,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX7-NEXT: flat_load_dword v5, v[4:5] ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7: .LBB16_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -3674,7 +3678,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: .LBB17_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX12: .LBB17_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -3728,7 +3732,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-NEXT: .LBB17_3: ; %atomicrmw.global ; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX942: .LBB17_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] @@ -3780,7 +3784,7 @@ define double 
@flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX11-NEXT: .LBB17_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX11: .LBB17_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -3832,7 +3836,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX10-NEXT: .LBB17_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX10: .LBB17_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -3887,7 +3891,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX90A: .LBB17_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] @@ -3938,7 +3942,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX908-NEXT: .LBB17_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX908: .LBB17_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -3995,7 +3999,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 
-; GFX8-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX8: .LBB17_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -4053,7 +4057,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX7: .LBB17_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -4120,7 +4124,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB18_4: ; %atomicrmw.start +; GFX12: .LBB18_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ -4175,7 +4179,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: .LBB18_4: ; %atomicrmw.start +; GFX942: .LBB18_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] @@ -4227,7 +4231,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB18_4: ; %atomicrmw.start +; GFX11: .LBB18_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 @@ 
-4279,7 +4283,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB18_4: ; %atomicrmw.start +; GFX10: .LBB18_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -4334,7 +4338,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.start +; GFX90A: .LBB18_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] @@ -4385,7 +4389,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB18_4: ; %atomicrmw.start +; GFX908: .LBB18_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -4442,7 +4446,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB18_4: ; %atomicrmw.start +; GFX8: .LBB18_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -4500,7 +4504,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; 
GFX7-NEXT: .LBB18_4: ; %atomicrmw.start +; GFX7: .LBB18_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -4563,7 +4567,7 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX12: .LBB19_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] @@ -4613,7 +4617,7 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX942-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX942: .LBB19_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4661,7 +4665,7 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX11-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX11: .LBB19_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4709,7 +4713,7 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX10-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX10: .LBB19_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4761,7 +4765,7 @@ 
define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX90A: .LBB19_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4809,7 +4813,7 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX908-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX908: .LBB19_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4863,7 +4867,7 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX8: .LBB19_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4918,7 +4922,7 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX7: .LBB19_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4983,7 +4987,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB20_4: ; 
%atomicrmw.start +; GFX12: .LBB20_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] @@ -5035,7 +5039,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX942: .LBB20_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5086,7 +5090,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX11-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX11: .LBB20_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5136,7 +5140,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX10-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX10: .LBB20_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5190,7 +5194,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX90A: .LBB20_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], 
v[6:7], -v[2:3] @@ -5240,7 +5244,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX908-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX908: .LBB20_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5296,7 +5300,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX8: .LBB20_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5353,7 +5357,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX7: .LBB20_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5419,7 +5423,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX12: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] @@ -5472,7 +5476,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; 
GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX942: .LBB21_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5523,7 +5527,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX11: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5573,7 +5577,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX10: .LBB21_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5627,7 +5631,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX90A: .LBB21_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5677,7 +5681,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX908: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5733,7 +5737,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX8: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5790,7 +5794,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX7: .LBB21_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5849,7 +5853,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB22_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5893,7 +5897,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB22_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5933,7 +5937,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half 
%val) #0 { ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX942: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -5967,7 +5971,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB22_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6006,7 +6010,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB22_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6044,7 +6048,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6078,7 +6082,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start 
+; GFX90A: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -6110,7 +6114,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -6142,7 +6146,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6176,7 +6180,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -6222,7 +6226,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB23_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6267,7 +6271,7 @@ define half 
@flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB23_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6309,7 +6313,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX942: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -6344,7 +6348,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB23_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6384,7 +6388,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB23_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 
v6, v5 @@ -6423,7 +6427,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6458,7 +6462,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -6491,7 +6495,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -6524,7 +6528,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6559,7 +6563,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7: 
.LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -6606,7 +6610,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB24_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6651,7 +6655,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB24_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6694,7 +6698,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX942: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -6729,7 +6733,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; 
GFX11-TRUE16: .LBB24_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6769,7 +6773,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB24_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6808,7 +6812,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6843,7 +6847,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -6876,7 +6880,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -6909,7 +6913,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6944,7 +6948,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -6990,7 +6994,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB25_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7033,7 +7037,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB25_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7071,7 +7075,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, 
s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX942: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7104,7 +7108,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB25_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7142,7 +7146,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB25_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7178,7 +7182,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7211,7 +7215,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A: 
.LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7242,7 +7246,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7273,7 +7277,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7306,7 +7310,7 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -7350,7 +7354,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7394,7 +7398,7 @@ 
define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7434,7 +7438,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7468,7 +7472,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7507,7 +7511,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7544,7 +7548,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7578,7 +7582,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7610,7 +7614,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7642,7 +7646,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7676,7 +7680,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 
0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7721,7 +7725,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7765,7 +7769,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7806,7 +7810,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7840,7 +7844,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7879,7 +7883,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7916,7 +7920,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7950,7 +7954,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7982,7 +7986,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 
s[4:5], 0 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8014,7 +8018,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8048,7 +8052,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8084,7 +8088,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8117,7 +8121,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8148,7 +8152,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX942: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -8172,7 +8176,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8200,7 +8204,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -8231,7 +8235,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -8257,7 +8261,7 @@ 
define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -8281,7 +8285,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -8306,7 +8310,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -8333,7 +8337,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -8368,7 +8372,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB29_1: ; 
%atomicrmw.start +; GFX12-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l @@ -8400,7 +8404,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 @@ -8429,7 +8433,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2 @@ -8452,7 +8456,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l @@ -8479,7 +8483,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16: 
.LBB29_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 @@ -8508,7 +8512,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_f16_e32 v3, v4, v2 @@ -8534,7 +8538,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_sub_f16_e32 v3, v5, v2 @@ -8557,7 +8561,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_sub_f16_e32 v3, v4, v2 @@ -8581,7 +8585,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_e32 v3, v4, v2 @@ -8608,7 +8612,7 @@ 
define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -8651,7 +8655,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8697,7 +8701,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8740,7 +8744,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX942: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -8775,7 +8779,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half 
%val) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8815,7 +8819,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8854,7 +8858,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8889,7 +8893,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -8924,7 +8928,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 
; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -8957,7 +8961,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8992,7 +8996,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -9039,7 +9043,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9084,7 +9088,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB31_1: ; 
%atomicrmw.start +; GFX12-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9125,7 +9129,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX942: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -9159,7 +9163,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9198,7 +9202,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9235,7 +9239,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: 
.LBB31_1: ; %atomicrmw.start +; GFX10: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9269,7 +9273,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -9303,7 +9307,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9335,7 +9339,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9369,7 +9373,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9417,7 +9421,7 @@ define bfloat 
@flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9474,7 +9478,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9525,7 +9529,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX942: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -9666,7 +9670,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9707,7 +9711,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: 
s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -9746,7 +9750,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -9784,7 +9788,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9824,7 +9828,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -9870,7 +9874,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9930,7 +9934,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9983,7 +9987,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX942: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -10128,7 +10132,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10170,7 +10174,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10210,7 +10214,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10249,7 +10253,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10290,7 +10294,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -10337,7 +10341,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10397,7 +10401,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16: 
.LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10451,7 +10455,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX942: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -10596,7 +10600,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10638,7 +10642,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10678,7 +10682,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10717,7 +10721,7 @@ define bfloat 
@flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10758,7 +10762,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -10804,7 +10808,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10859,7 +10863,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10908,7 +10912,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: 
s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX942: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11044,7 +11048,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11084,7 +11088,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11122,7 +11126,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11159,7 +11163,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 
s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11198,7 +11202,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11242,7 +11246,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB36_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11300,7 +11304,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB36_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11351,7 +11355,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; 
GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11491,7 +11495,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11532,7 +11536,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11571,7 +11575,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11609,7 +11613,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX8-NEXT: 
v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11649,7 +11653,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11694,7 +11698,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11752,7 +11756,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11804,7 +11808,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX942-NEXT: s_mov_b64 s[0:1], 0 
; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11944,7 +11948,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11985,7 +11989,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12024,7 +12028,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12062,7 +12066,7 @@ define void 
@flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12102,7 +12106,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12138,7 +12142,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -12185,7 +12189,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -12227,7 +12231,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX942-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX942: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -12342,7 +12346,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -12376,7 +12380,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -12409,7 +12413,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -12442,7 +12446,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -12476,7 +12480,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -12511,7 +12515,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB39_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l @@ -12556,7 +12560,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB39_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12596,7 +12600,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942: .LBB39_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12706,7 +12710,7 @@ define void 
@flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12740,7 +12744,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12772,7 +12776,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12804,7 +12808,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12838,7 +12842,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; 
%atomicrmw.start +; GFX7: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12881,7 +12885,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12942,7 +12946,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12996,7 +13000,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942: .LBB40_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -13141,7 +13145,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -13183,7 +13187,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -13225,7 +13229,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -13264,7 +13268,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -13305,7 +13309,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX7-NEXT: v_not_b32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 @@ -13352,7 +13356,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -13411,7 +13415,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13463,7 +13467,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942: .LBB41_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13603,7 +13607,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13644,7 +13648,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, 
bfloat ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13685,7 +13689,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13723,7 +13727,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13763,7 +13767,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13803,7 +13807,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -13830,7 +13834,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -13853,7 +13857,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -13879,7 +13883,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -13903,7 +13907,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13925,7 +13929,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13947,7 +13951,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -13979,7 +13983,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -14022,7 +14026,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14049,7 +14053,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, 
v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942: .LBB43_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14072,7 +14076,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14100,7 +14104,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -14123,7 +14127,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14145,7 +14149,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14169,7 +14173,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -14202,7 +14206,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14244,7 +14248,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14279,7 +14283,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, v0 @@ -14304,6 +14308,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; 
GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v0, v[3:4] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14331,7 +14336,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -14359,7 +14364,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 @@ -14385,7 +14390,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 @@ -14408,7 +14413,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -14441,7 +14446,7 
@@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14483,7 +14488,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14508,7 +14513,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942: .LBB45_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14530,7 +14535,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14554,7 +14559,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; 
GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14577,7 +14582,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14598,7 +14603,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14619,7 +14624,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -14650,7 +14655,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB45_1: ; 
%atomicrmw.start +; GFX7: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -14691,7 +14696,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14716,7 +14721,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14738,7 +14743,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14764,7 +14769,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14787,7 +14792,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14808,7 +14813,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14831,7 +14836,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -14864,7 +14869,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: 
v_cvt_f16_f32_e32 v4, v4 @@ -14906,7 +14911,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14937,7 +14942,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14962,6 +14967,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14988,7 +14994,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15016,7 +15022,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword 
v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15042,7 +15048,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15065,7 +15071,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15098,7 +15104,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15140,7 +15146,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12: .LBB48_1: ; %atomicrmw.start ; 
GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -15168,7 +15174,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -15191,7 +15197,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -15219,7 +15225,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -15242,7 +15248,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -15266,7 +15272,7 @@ define <2 x half> 
@flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -15290,7 +15296,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -15323,7 +15329,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -15365,7 +15371,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15391,7 +15397,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; 
GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15413,7 +15419,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15439,7 +15445,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15462,7 +15468,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15485,7 +15491,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; 
GFX908: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15508,7 +15514,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15541,7 +15547,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15589,7 +15595,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB50_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -15643,7 +15649,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB50_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -15693,7 +15699,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -15832,7 +15838,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -15874,7 +15880,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -15914,7 +15920,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -15952,7 +15958,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; 
GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -15998,7 +16004,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16040,7 +16046,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB51_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16094,7 +16100,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB51_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16144,7 +16150,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: 
.LBB51_1: ; %atomicrmw.start +; GFX942: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -16285,7 +16291,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -16326,7 +16332,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16366,7 +16372,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16406,7 +16412,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16453,7 +16459,7 @@ define <2 x 
bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -16494,7 +16500,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB52_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16548,7 +16554,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB52_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16606,7 +16612,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v0 @@ -16749,7 +16755,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: 
s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -16795,7 +16801,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -16839,7 +16845,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -16878,7 +16884,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16925,7 +16931,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; 
GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -16966,7 +16972,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB53_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17018,7 +17024,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB53_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17067,7 +17073,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17201,7 +17207,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17242,7 +17248,7 @@ define void 
@flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17281,7 +17287,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17318,7 +17324,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17363,7 +17369,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -17403,7 +17409,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 
s0, 0 -; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17455,7 +17461,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17504,7 +17510,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17640,7 +17646,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17681,7 +17687,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17720,7 +17726,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17759,7 +17765,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17806,7 +17812,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -17847,7 +17853,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17899,7 +17905,7 @@ define void 
@flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17954,7 +17960,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18096,7 +18102,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18142,7 +18148,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -18186,7 +18192,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, 
v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -18225,7 +18231,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18272,7 +18278,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18313,7 +18319,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -18368,7 +18374,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB56_1: ; %atomicrmw.start ; 
GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -18419,7 +18425,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -18560,7 +18566,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -18601,7 +18607,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -18643,7 +18649,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -18683,7 +18689,7 @@ define <2 x bfloat> 
@flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -18730,7 +18736,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -18771,7 +18777,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18824,7 +18830,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18874,7 +18880,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 
0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19010,7 +19016,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19051,7 +19057,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19092,7 +19098,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19131,7 +19137,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19178,7 +19184,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll index e8efa859ce13f..665a2015720f2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll @@ -22,6 +22,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap ; GCN-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GCN-NEXT: .p2align 5 ; GCN-NEXT: .LBB0_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 20795431b4cd8..49c59c40c766d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -2113,6 +2113,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; GFX1250-SDAG: ; %bb.0: ; %bb ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: .p2align ; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3 ; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -2131,6 +2132,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], 
s[0:1] +; GFX1250-GISEL-NEXT: .p2align ; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -2168,6 +2170,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-SDAG: ; %bb.0: ; %bb ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: .p2align ; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3 ; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -2188,6 +2191,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: .p2align ; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index e74ad3d62bea4..e84360bbced98 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -1063,7 +1063,7 @@ define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN1: .LBB30_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 @@ -1084,7 +1084,7 @@ define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN2: .LBB30_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 @@ -1105,7 +1105,7 @@ define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN3: .LBB30_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 @@ -1132,7 +1132,7 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN1: .LBB31_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 @@ -1155,7 +1155,7 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN2: .LBB31_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 @@ -1176,7 +1176,7 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN3: .LBB31_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 @@ -1202,7 +1202,7 @@ define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: 
s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN1: .LBB32_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -1224,7 +1224,7 @@ define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN2: .LBB32_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -1246,7 +1246,7 @@ define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN3: .LBB32_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -1274,7 +1274,7 @@ define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN1: .LBB33_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -1297,7 +1297,7 @@ define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN2: .LBB33_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -1318,7 +1318,7 @@ define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN3: .LBB33_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -1347,7 +1347,7 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN1: .LBB34_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, s6, v3 @@ -1370,7 +1370,7 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN2: .LBB34_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 @@ -1393,7 +1393,7 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN3: .LBB34_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_subrev_u32_e32 v2, s6, v3 @@ -1422,7 +1422,7 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN1: .LBB35_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: 
v_subrev_i32_e32 v2, vcc, s6, v3 @@ -1447,7 +1447,7 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN2: .LBB35_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 @@ -1470,7 +1470,7 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN3: .LBB35_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_subrev_u32_e32 v2, s6, v3 @@ -1500,7 +1500,7 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN1: .LBB36_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -1525,7 +1525,7 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN2: .LBB36_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -1550,7 +1550,7 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN3: .LBB36_1: ; 
%atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -1579,7 +1579,7 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN1: .LBB37_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -1604,7 +1604,7 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN2: .LBB37_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -1629,7 +1629,7 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN3: .LBB37_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -1725,7 +1725,7 @@ define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB40_1: ; %atomicrmw.start +; GCN1: .LBB40_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1746,7 +1746,7 @@ define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; 
GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB40_1: ; %atomicrmw.start +; GCN2: .LBB40_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1767,7 +1767,7 @@ define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB40_1: ; %atomicrmw.start +; GCN3: .LBB40_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1794,7 +1794,7 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN1: .LBB41_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1817,7 +1817,7 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN2: .LBB41_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1838,7 +1838,7 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN3: .LBB41_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1864,7 +1864,7 @@ define i32 
@flat_atomic_and_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN1: .LBB42_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -1886,7 +1886,7 @@ define i32 @flat_atomic_and_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN2: .LBB42_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -1908,7 +1908,7 @@ define i32 @flat_atomic_and_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN3: .LBB42_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -1936,7 +1936,7 @@ define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN1: .LBB43_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -1959,7 +1959,7 @@ define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN2: .LBB43_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -1980,7 +1980,7 @@ define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN3: .LBB43_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -2009,7 +2009,7 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN1: .LBB44_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2032,7 +2032,7 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN2: .LBB44_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2055,7 +2055,7 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN3: .LBB44_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2084,7 +2084,7 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN1: 
.LBB45_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2109,7 +2109,7 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN2: .LBB45_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2132,7 +2132,7 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN3: .LBB45_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2162,7 +2162,7 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN1: .LBB46_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -2187,7 +2187,7 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN2: .LBB46_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -2212,7 +2212,7 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 
s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN3: .LBB46_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -2241,7 +2241,7 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN1: .LBB47_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -2266,7 +2266,7 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN2: .LBB47_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -2291,7 +2291,7 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN3: .LBB47_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -2387,6 +2387,7 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2409,6 +2410,7 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2431,6 +2433,7 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2459,6 +2462,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2483,6 +2487,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2505,6 +2510,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2532,6 +2538,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start ; 
GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2555,6 +2562,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2578,6 +2586,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2607,6 +2616,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2631,6 +2641,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2653,6 +2664,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2683,6 +2695,7 @@ define amdgpu_gfx void 
@flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2707,6 +2720,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2731,6 +2745,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2761,6 +2776,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2787,6 +2803,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2811,6 +2828,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; 
GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2842,6 +2860,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2868,6 +2887,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2894,6 +2914,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2924,6 +2945,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2950,6 +2972,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) @@ -2976,6 +2999,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3005,6 +3029,7 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3029,6 +3054,7 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3051,6 +3077,7 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3080,6 +3107,7 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3104,6 +3132,7 @@ define i32 
@flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3126,6 +3155,7 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3158,7 +3188,7 @@ define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN1: .LBB60_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3179,7 +3209,7 @@ define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN2: .LBB60_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3200,7 +3230,7 @@ define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN3: .LBB60_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 @@ 
-3227,7 +3257,7 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN1: .LBB61_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3250,7 +3280,7 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN2: .LBB61_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3271,7 +3301,7 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN3: .LBB61_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3297,7 +3327,7 @@ define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN1: .LBB62_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -3319,7 +3349,7 @@ define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN2: .LBB62_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -3341,7 +3371,7 @@ define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN3: .LBB62_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -3369,7 +3399,7 @@ define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN1: .LBB63_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -3392,7 +3422,7 @@ define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN2: .LBB63_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -3413,7 +3443,7 @@ define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN3: .LBB63_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -3442,7 +3472,7 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB64_1: ; 
%atomicrmw.start +; GCN1: .LBB64_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_or_b32_e32 v2, s6, v3 @@ -3465,7 +3495,7 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN2: .LBB64_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_or_b32_e32 v2, s6, v3 @@ -3488,7 +3518,7 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN3: .LBB64_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_or_b32_e32 v2, s6, v3 @@ -3517,7 +3547,7 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN1: .LBB65_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_or_b32_e32 v2, s6, v3 @@ -3542,7 +3572,7 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN2: .LBB65_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_or_b32_e32 v2, s6, v3 @@ -3565,7 +3595,7 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i ; GCN3-NEXT: 
v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN3: .LBB65_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_or_b32_e32 v2, s6, v3 @@ -3595,7 +3625,7 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN1: .LBB66_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -3620,7 +3650,7 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN2: .LBB66_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -3645,7 +3675,7 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN3: .LBB66_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -3674,7 +3704,7 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN1: .LBB67_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -3699,7 +3729,7 @@ 
define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN2: .LBB67_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -3724,7 +3754,7 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN3: .LBB67_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -3820,7 +3850,7 @@ define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN1: .LBB70_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -3841,7 +3871,7 @@ define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN2: .LBB70_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -3862,7 +3892,7 @@ define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN3: .LBB70_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -3889,7 +3919,7 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN1: .LBB71_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -3912,7 +3942,7 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN2: .LBB71_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -3933,7 +3963,7 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN3: .LBB71_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -3959,7 +3989,7 @@ define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN1: .LBB72_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -3981,7 +4011,7 @@ define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB72_1: ; 
%atomicrmw.start +; GCN2: .LBB72_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -4003,7 +4033,7 @@ define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN3: .LBB72_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -4031,7 +4061,7 @@ define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN1: .LBB73_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -4054,7 +4084,7 @@ define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN2: .LBB73_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -4075,7 +4105,7 @@ define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN3: .LBB73_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -4104,7 +4134,7 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: 
flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN1: .LBB74_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_xor_b32_e32 v2, s6, v3 @@ -4127,7 +4157,7 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN2: .LBB74_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_xor_b32_e32 v2, s6, v3 @@ -4150,7 +4180,7 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN3: .LBB74_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_xor_b32_e32 v2, s6, v3 @@ -4179,7 +4209,7 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN1: .LBB75_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_xor_b32_e32 v2, s6, v3 @@ -4204,7 +4234,7 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN2: .LBB75_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_xor_b32_e32 v2, s6, v3 @@ -4227,7 +4257,7 @@ define 
amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN3: .LBB75_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_xor_b32_e32 v2, s6, v3 @@ -4257,7 +4287,7 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN1: .LBB76_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -4282,7 +4312,7 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN2: .LBB76_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -4307,7 +4337,7 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN3: .LBB76_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -4336,7 +4366,7 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN1: .LBB77_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -4361,7 +4391,7 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN2: .LBB77_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -4386,7 +4416,7 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN3: .LBB77_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -4482,6 +4512,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4503,6 +4534,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4524,6 +4556,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-4551,6 +4584,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4574,6 +4608,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4595,6 +4630,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4621,6 +4657,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4643,6 +4680,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4665,6 +4703,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword 
v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4693,6 +4732,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4716,6 +4756,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4737,6 +4778,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4766,6 +4808,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4789,6 +4832,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4812,6 +4856,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4841,6 +4886,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4866,6 +4912,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4889,6 +4936,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4919,6 +4967,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4944,6 +4993,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg 
; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4969,6 +5019,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4998,6 +5049,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5023,6 +5075,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5048,6 +5101,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5084,6 +5138,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start 
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5114,6 +5169,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5142,6 +5198,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5180,6 +5237,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5215,6 +5273,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5248,6 +5307,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5288,6 +5348,7 @@ define amdgpu_kernel void 
@atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5316,6 +5377,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5344,6 +5406,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5379,6 +5442,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5412,6 +5476,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5445,6 +5510,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; 
GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5546,6 +5612,7 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5567,6 +5634,7 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5588,6 +5656,7 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5615,6 +5684,7 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5638,6 +5708,7 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5659,6 
+5730,7 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5685,6 +5757,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5707,6 +5780,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5729,6 +5803,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5757,6 +5832,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5780,6 +5856,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, 
v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5801,6 +5878,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5830,6 +5908,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5853,6 +5932,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5876,6 +5956,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5905,6 +5986,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5930,6 +6012,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5953,6 +6036,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5983,6 +6067,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6008,6 +6093,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6033,6 +6119,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6062,6 +6149,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, 
i3 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6087,6 +6175,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6112,6 +6201,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6148,6 +6238,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6178,6 +6269,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6206,6 +6298,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: 
.LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6244,6 +6337,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6279,6 +6373,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6312,6 +6407,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6353,6 +6449,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6386,6 +6483,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6419,6 +6517,7 @@ 
define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6520,6 +6619,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6541,6 +6641,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6562,6 +6663,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6589,6 +6691,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6612,6 +6715,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; 
GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6633,6 +6737,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6659,6 +6764,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6681,6 +6787,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6703,6 +6810,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6731,6 +6839,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6754,6 +6863,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6775,6 +6885,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6804,6 +6915,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6827,6 +6939,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6850,6 +6963,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6879,6 +6993,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg 
%out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6904,6 +7019,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6927,6 +7043,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6957,6 +7074,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6982,6 +7100,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7007,6 +7126,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .p2align ; GCN3-NEXT: 
.LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7036,6 +7156,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7061,6 +7182,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7086,6 +7208,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7182,6 +7305,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7203,6 +7327,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7224,6 +7349,7 @@ define void 
@flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7251,6 +7377,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7274,6 +7401,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7295,6 +7423,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7321,6 +7450,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7343,6 +7473,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: 
s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7365,6 +7496,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7393,6 +7525,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7416,6 +7549,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7437,6 +7571,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7466,6 +7601,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7489,6 +7625,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7512,6 +7649,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7541,6 +7679,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7566,6 +7705,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7589,6 +7729,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7619,6 +7760,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; 
GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7644,6 +7786,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7669,6 +7812,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7698,6 +7842,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7723,6 +7868,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7748,6 +7894,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start 
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7784,6 +7931,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7814,6 +7962,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7842,6 +7991,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7880,6 +8030,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7915,6 +8066,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7948,6 +8100,7 @@ define amdgpu_kernel void 
@atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7984,6 +8137,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8008,6 +8162,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8032,6 +8187,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8066,6 +8222,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8099,6 +8256,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: 
.LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8132,6 +8290,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8233,7 +8392,7 @@ define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB131_1: ; %atomicrmw.start +; GCN1: .LBB131_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 @@ -8256,7 +8415,7 @@ define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB131_1: ; %atomicrmw.start +; GCN2: .LBB131_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 @@ -8279,7 +8438,7 @@ define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB131_1: ; %atomicrmw.start +; GCN3: .LBB131_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 @@ -8308,7 +8467,7 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; 
GCN1-NEXT: .LBB132_1: ; %atomicrmw.start +; GCN1: .LBB132_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 @@ -8333,7 +8492,7 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB132_1: ; %atomicrmw.start +; GCN2: .LBB132_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 @@ -8356,7 +8515,7 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB132_1: ; %atomicrmw.start +; GCN3: .LBB132_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 @@ -8384,7 +8543,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB133_1: ; %atomicrmw.start +; GCN1: .LBB133_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -8408,7 +8567,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB133_1: ; %atomicrmw.start +; GCN2: .LBB133_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -8432,7 +8591,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret(ptr 
%ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB133_1: ; %atomicrmw.start +; GCN3: .LBB133_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -8462,7 +8621,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB134_1: ; %atomicrmw.start +; GCN1: .LBB134_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -8487,7 +8646,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB134_1: ; %atomicrmw.start +; GCN2: .LBB134_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -8510,7 +8669,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB134_1: ; %atomicrmw.start +; GCN3: .LBB134_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -8541,7 +8700,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB135_1: ; %atomicrmw.start +; GCN1: .LBB135_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v2, vcc, 1, v3 @@ -8566,7 +8725,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB135_1: ; %atomicrmw.start +; GCN2: .LBB135_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v2, vcc, 1, v3 @@ -8591,7 +8750,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB135_1: ; %atomicrmw.start +; GCN3: .LBB135_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_u32_e32 v2, 1, v3 @@ -8622,7 +8781,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB136_1: ; %atomicrmw.start +; GCN1: .LBB136_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v2, vcc, 1, v3 @@ -8649,7 +8808,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB136_1: ; %atomicrmw.start +; GCN2: .LBB136_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v2, vcc, 1, v3 @@ -8674,7 +8833,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; 
GCN3-NEXT: .LBB136_1: ; %atomicrmw.start +; GCN3: .LBB136_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_u32_e32 v2, 1, v3 @@ -8706,7 +8865,7 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB137_1: ; %atomicrmw.start +; GCN1: .LBB137_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -8733,7 +8892,7 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB137_1: ; %atomicrmw.start +; GCN2: .LBB137_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -8760,7 +8919,7 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB137_1: ; %atomicrmw.start +; GCN3: .LBB137_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -8791,7 +8950,7 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB138_1: ; %atomicrmw.start +; GCN1: .LBB138_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -8818,7 +8977,7 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: 
v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB138_1: ; %atomicrmw.start +; GCN2: .LBB138_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -8845,7 +9004,7 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB138_1: ; %atomicrmw.start +; GCN3: .LBB138_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -8943,7 +9102,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB141_1: ; %atomicrmw.start +; GCN1: .LBB141_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 @@ -8968,7 +9127,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB141_1: ; %atomicrmw.start +; GCN2: .LBB141_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 @@ -8993,7 +9152,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB141_1: ; %atomicrmw.start +; GCN3: .LBB141_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v4 @@ -9024,7 +9183,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB142_1: ; %atomicrmw.start +; GCN1: .LBB142_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 @@ -9051,7 +9210,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB142_1: ; %atomicrmw.start +; GCN2: .LBB142_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 @@ -9076,7 +9235,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB142_1: ; %atomicrmw.start +; GCN3: .LBB142_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 @@ -9106,7 +9265,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB143_1: ; %atomicrmw.start +; GCN1: .LBB143_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -9132,7 +9291,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB143_1: 
; %atomicrmw.start +; GCN2: .LBB143_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -9158,7 +9317,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB143_1: ; %atomicrmw.start +; GCN3: .LBB143_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -9190,7 +9349,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB144_1: ; %atomicrmw.start +; GCN1: .LBB144_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -9217,7 +9376,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB144_1: ; %atomicrmw.start +; GCN2: .LBB144_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -9242,7 +9401,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB144_1: ; %atomicrmw.start +; GCN3: .LBB144_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -9276,7 +9435,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; 
GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s6 -; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start +; GCN1: .LBB145_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3 @@ -9304,7 +9463,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s6 -; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start +; GCN2: .LBB145_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3 @@ -9332,7 +9491,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 -; GCN3-NEXT: .LBB145_1: ; %atomicrmw.start +; GCN3: .LBB145_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -9366,7 +9525,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s6 -; GCN1-NEXT: .LBB146_1: ; %atomicrmw.start +; GCN1: .LBB146_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3 @@ -9396,7 +9555,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s6 -; GCN2-NEXT: .LBB146_1: ; %atomicrmw.start +; GCN2: .LBB146_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3 @@ -9424,7 +9583,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 -; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start +; GCN3: .LBB146_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -9459,7 +9618,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s6 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB147_1: ; %atomicrmw.start +; GCN1: .LBB147_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v5, v0 @@ -9489,7 +9648,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s6 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB147_1: ; %atomicrmw.start +; GCN2: .LBB147_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v5, v0 @@ -9519,7 +9678,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s6 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB147_1: ; %atomicrmw.start +; GCN3: .LBB147_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v5, v0 @@ -9553,7 +9712,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s6 -; GCN1-NEXT: .LBB148_1: ; %atomicrmw.start +; GCN1: .LBB148_1: ; 
%atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v5, v0 @@ -9583,7 +9742,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s6 -; GCN2-NEXT: .LBB148_1: ; %atomicrmw.start +; GCN2: .LBB148_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v5, v0 @@ -9613,7 +9772,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s6 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB148_1: ; %atomicrmw.start +; GCN3: .LBB148_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v5, v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll index 757649ca592b3..f2652edef8893 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -462,7 +462,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 @@ -489,7 +489,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 @@ -534,7 +534,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -566,7 +566,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -623,7 +623,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 @@ -654,7 +654,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 @@ -707,7 +707,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 
-; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -741,7 +741,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -796,7 +796,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 @@ -823,7 +823,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 @@ -867,7 +867,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -899,7 +899,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: flat_load_dwordx2 v[0:1], 
v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -953,7 +953,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 @@ -982,7 +982,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -1119,7 +1119,7 @@ 
define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 @@ -1193,7 +1193,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v8, v3 @@ -1226,7 +1226,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v8, v3 @@ -1284,7 +1284,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 @@ -1316,7 +1316,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 @@ -1370,7 +1370,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v8, v3 @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v8, v3 @@ -1461,7 +1461,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 @@ -1489,7 +1489,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: 
.LBB20_1: ; %atomicrmw.start +; GFX8: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 @@ -1534,7 +1534,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v8, v1 @@ -1567,7 +1567,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v8, v1 @@ -1622,7 +1622,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 @@ -1652,7 +1652,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 @@ -1703,7 +1703,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; 
GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v8, v3 @@ -1736,7 +1736,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v8, v3 @@ -1792,7 +1792,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -1821,7 +1821,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -1868,7 +1868,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -1902,7 
+1902,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -1961,7 +1961,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -2049,7 +2049,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -2085,7 +2085,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -2142,7 +2142,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -2171,7 +2171,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -2217,7 +2217,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -2251,7 +2251,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -2307,7 +2307,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7: .LBB30_1: ; %atomicrmw.start 
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -2338,7 +2338,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -2390,7 +2390,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -2424,7 +2424,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -2480,7 +2480,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -2509,7 +2509,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, 
s2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -2556,7 +2556,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -2590,7 +2590,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -2649,7 +2649,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -2682,7 +2682,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -2737,7 +2737,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr 
%out2 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -2773,7 +2773,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -2830,7 +2830,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -2859,7 +2859,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -2905,7 +2905,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -2939,7 +2939,7 @@ define amdgpu_kernel 
void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -2995,7 +2995,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -3026,7 +3026,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -3078,7 +3078,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -3112,7 +3112,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -3168,7 +3168,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -3197,7 +3197,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -3244,7 +3244,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -3278,7 +3278,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -3337,7 +3337,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -3370,7 +3370,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -3425,7 +3425,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -3461,7 +3461,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -3518,7 +3518,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -3547,7 +3547,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: .LBB44_1: ; 
%atomicrmw.start +; GFX8: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -3593,7 +3593,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -3627,7 +3627,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3683,7 +3683,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -3714,7 +3714,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -3766,7 +3766,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_mov_b64 
s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -3800,7 +3800,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] @@ -3885,7 +3885,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] @@ -3932,7 +3932,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -3966,7 +3966,7 @@ define amdgpu_kernel void 
@atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -4025,7 +4025,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] @@ -4058,7 +4058,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] @@ -4113,7 +4113,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -4149,7 +4149,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -4206,7 +4206,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] @@ -4235,7 +4235,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] @@ -4281,7 +4281,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -4315,7 +4315,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -4371,7 +4371,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] @@ -4402,7 +4402,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] @@ -4454,7 +4454,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -4488,7 +4488,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -4542,7 +4542,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 @@ -4569,7 +4569,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: 
.LBB56_1: ; %atomicrmw.start +; GFX8: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 @@ -4614,7 +4614,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -4646,7 +4646,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -4703,7 +4703,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 @@ -4734,7 +4734,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 @@ -4787,7 +4787,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; 
GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -4821,7 +4821,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -4876,7 +4876,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 @@ -4903,7 +4903,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 @@ -4947,7 +4947,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -4979,7 +4979,7 @@ 
define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -5033,7 +5033,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX7: .LBB62_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 @@ -5062,7 +5062,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX8: .LBB62_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 @@ -5112,7 +5112,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX7: .LBB63_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -5144,7 +5144,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX8: .LBB63_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -5740,7 +5740,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX7: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 @@ -5767,7 +5767,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX8: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3 @@ -5812,7 +5812,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX7: .LBB75_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -5844,7 +5844,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX8: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -5901,7 +5901,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; 
GFX7-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX7: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 @@ -5932,7 +5932,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX8: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3 @@ -5985,7 +5985,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX7: .LBB77_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -6019,7 +6019,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX8: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -6074,7 +6074,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX7: .LBB78_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 @@ -6101,7 +6101,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX8-NEXT: 
flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX8: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3 @@ -6145,7 +6145,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX7: .LBB79_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -6177,7 +6177,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX8: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -6231,7 +6231,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX7: .LBB80_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 @@ -6260,7 +6260,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX8: .LBB80_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, 
s3, v3 @@ -6310,7 +6310,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX7: .LBB81_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -6342,7 +6342,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX8: .LBB81_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -7764,7 +7764,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX7: .LBB107_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 @@ -7794,7 +7794,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX8: .LBB107_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 @@ -7842,7 +7842,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX7: .LBB108_1: ; %atomicrmw.start ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -7877,7 +7877,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX8: .LBB108_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -7937,7 +7937,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX7: .LBB109_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 @@ -7971,7 +7971,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX8: .LBB109_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 @@ -8027,7 +8027,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX7: .LBB110_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -8064,7 +8064,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: 
flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX8: .LBB110_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -8122,7 +8122,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX7: .LBB111_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 @@ -8152,7 +8152,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX8: .LBB111_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 @@ -8197,7 +8197,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX7: .LBB112_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -8230,7 +8230,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX8: .LBB112_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -8287,7 +8287,7 @@ define amdgpu_kernel 
void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX7: .LBB113_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 @@ -8319,7 +8319,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX8: .LBB113_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 @@ -8372,7 +8372,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX7: .LBB114_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -8407,7 +8407,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX8: .LBB114_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v3 @@ -8466,7 +8466,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX7: .LBB115_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -8500,7 +8500,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX8: .LBB115_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -8552,7 +8552,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX7: .LBB116_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -8591,7 +8591,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX8: .LBB116_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -8655,7 +8655,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX7: .LBB117_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -8693,7 +8693,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: 
.LBB117_1: ; %atomicrmw.start +; GFX8: .LBB117_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -8753,7 +8753,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s9 ; GFX7-NEXT: v_mov_b32_e32 v5, s8 -; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX7: .LBB118_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -8794,7 +8794,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX8: .LBB118_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -8856,7 +8856,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX7: .LBB119_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -8890,7 +8890,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX8: .LBB119_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -8939,7 +8939,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; 
GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX7: .LBB120_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -8976,7 +8976,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX8: .LBB120_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 @@ -9037,7 +9037,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX7: .LBB121_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -9073,7 +9073,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX8: .LBB121_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -9130,7 +9130,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s9 ; GFX7-NEXT: v_mov_b32_e32 v5, s8 -; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX7: .LBB122_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v3 @@ -9169,7 +9169,7 @@ define amdgpu_kernel 
void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX8: .LBB122_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v3 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 524100c5b7a25..05664cb8a1bc4 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -3645,7 +3645,7 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB30_4: ; %atomicrmw.start +; GCN1: .LBB30_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 @@ -3704,7 +3704,7 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB30_4: ; %atomicrmw.start +; GCN2: .LBB30_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 @@ -3758,7 +3758,7 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB30_4: ; %atomicrmw.start +; GCN3: .LBB30_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 @@ -3822,7 +3822,7 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: 
flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB31_4: ; %atomicrmw.start +; GCN1: .LBB31_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 @@ -3883,7 +3883,7 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB31_4: ; %atomicrmw.start +; GCN2: .LBB31_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 @@ -3939,7 +3939,7 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB31_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB31_4: ; %atomicrmw.start +; GCN3: .LBB31_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 @@ -3997,7 +3997,7 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB32_2: ; %atomicrmw.start +; GCN1: .LBB32_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -4055,7 +4055,7 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB32_2: ; %atomicrmw.start +; GCN2: .LBB32_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -4116,7 +4116,7 @@ define i64 
@flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB32_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB32_4: ; %atomicrmw.start +; GCN3: .LBB32_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -4181,7 +4181,7 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB33_4: ; %atomicrmw.start +; GCN1: .LBB33_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4243,7 +4243,7 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB33_4: ; %atomicrmw.start +; GCN2: .LBB33_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4300,7 +4300,7 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB33_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB33_4: ; %atomicrmw.start +; GCN3: .LBB33_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -4368,7 +4368,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: .LBB34_4: ; %atomicrmw.start +; GCN1: .LBB34_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: 
v_subrev_i32_e32 v0, vcc, s6, v2 @@ -4431,7 +4431,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: .LBB34_4: ; %atomicrmw.start +; GCN2: .LBB34_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 @@ -4486,7 +4486,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: .LBB34_4: ; %atomicrmw.start +; GCN3: .LBB34_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 @@ -4552,7 +4552,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: .LBB35_4: ; %atomicrmw.start +; GCN1: .LBB35_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 @@ -4617,7 +4617,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: .LBB35_4: ; %atomicrmw.start +; GCN2: .LBB35_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 @@ -4674,7 +4674,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: 
.LBB35_4: ; %atomicrmw.start +; GCN3: .LBB35_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 @@ -4733,7 +4733,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: .LBB36_2: ; %atomicrmw.start +; GCN1: .LBB36_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v8, v1 @@ -4794,7 +4794,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: .LBB36_2: ; %atomicrmw.start +; GCN2: .LBB36_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v8, v1 @@ -4847,7 +4847,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: .LBB36_2: ; %atomicrmw.start +; GCN3: .LBB36_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v8, v1 @@ -4911,7 +4911,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: .LBB37_2: ; %atomicrmw.start +; GCN1: .LBB37_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v8, v1 @@ -4974,7 +4974,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; 
GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: .LBB37_2: ; %atomicrmw.start +; GCN2: .LBB37_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v8, v1 @@ -5029,7 +5029,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: .LBB37_2: ; %atomicrmw.start +; GCN3: .LBB37_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v8, v1 @@ -5356,7 +5356,7 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB40_4: ; %atomicrmw.start +; GCN1: .LBB40_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 @@ -5415,7 +5415,7 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB40_4: ; %atomicrmw.start +; GCN2: .LBB40_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 @@ -5469,7 +5469,7 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB40_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB40_4: ; %atomicrmw.start +; GCN3: .LBB40_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 @@ -5533,7 +5533,7 @@ define void 
@flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB41_4: ; %atomicrmw.start +; GCN1: .LBB41_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 @@ -5594,7 +5594,7 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB41_4: ; %atomicrmw.start +; GCN2: .LBB41_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 @@ -5650,7 +5650,7 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB41_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB41_4: ; %atomicrmw.start +; GCN3: .LBB41_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 @@ -5708,7 +5708,7 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB42_2: ; %atomicrmw.start +; GCN1: .LBB42_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -5766,7 +5766,7 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB42_2: ; %atomicrmw.start +; GCN2: .LBB42_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: 
v_mov_b32_e32 v7, v5 @@ -5827,7 +5827,7 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB42_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB42_4: ; %atomicrmw.start +; GCN3: .LBB42_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -5892,7 +5892,7 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB43_4: ; %atomicrmw.start +; GCN1: .LBB43_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -5954,7 +5954,7 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB43_4: ; %atomicrmw.start +; GCN2: .LBB43_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6011,7 +6011,7 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB43_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB43_4: ; %atomicrmw.start +; GCN3: .LBB43_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -6078,7 +6078,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB44_4: ; %atomicrmw.start +; GCN1: .LBB44_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 @@ -6139,7 +6139,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB44_4: ; %atomicrmw.start +; GCN2: .LBB44_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 @@ -6192,7 +6192,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB44_4: ; %atomicrmw.start +; GCN3: .LBB44_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 @@ -6256,7 +6256,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB45_4: ; %atomicrmw.start +; GCN1: .LBB45_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 @@ -6319,7 +6319,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB45_4: ; %atomicrmw.start +; GCN2: .LBB45_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 @@ -6374,7 +6374,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: 
s_mov_b64 s[36:37], 0 -; GCN3-NEXT: .LBB45_4: ; %atomicrmw.start +; GCN3: .LBB45_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 @@ -6431,7 +6431,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB46_2: ; %atomicrmw.start +; GCN1: .LBB46_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -6490,7 +6490,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB46_2: ; %atomicrmw.start +; GCN2: .LBB46_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -6541,7 +6541,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB46_2: ; %atomicrmw.start +; GCN3: .LBB46_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -6603,7 +6603,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB47_2: ; %atomicrmw.start +; GCN1: .LBB47_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -6664,7 +6664,7 @@ define amdgpu_gfx i64 
@flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB47_2: ; %atomicrmw.start +; GCN2: .LBB47_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -6717,7 +6717,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v3, s35 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: .LBB47_2: ; %atomicrmw.start +; GCN3: .LBB47_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -7043,6 +7043,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB50_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7106,6 +7107,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB50_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7164,6 +7166,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB50_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7232,6 +7235,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, 
v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB51_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7297,6 +7301,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB51_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7357,6 +7362,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB51_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB51_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7419,6 +7425,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB52_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7481,6 +7488,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB52_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7546,6 +7554,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB52_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB52_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7615,6 +7624,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB53_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7681,6 +7691,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB53_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7742,6 +7753,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB53_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB53_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7813,6 +7825,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB54_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7878,6 +7891,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB54_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7935,6 +7949,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; 
GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8003,6 +8018,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB55_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8070,6 +8086,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB55_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8129,6 +8146,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB55_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8190,6 +8208,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8253,6 +8272,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: 
.LBB56_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8308,6 +8328,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8374,6 +8395,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8439,6 +8461,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8496,6 +8519,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v3, s35 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8565,6 +8589,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB58_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-8630,6 +8655,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB58_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8690,6 +8716,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: .LBB58_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB58_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8760,6 +8787,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB59_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8826,6 +8854,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB59_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8887,6 +8916,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: .LBB59_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB59_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8958,7 +8988,7 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, 
v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB60_4: ; %atomicrmw.start +; GCN1: .LBB60_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 @@ -9017,7 +9047,7 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB60_4: ; %atomicrmw.start +; GCN2: .LBB60_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 @@ -9071,7 +9101,7 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB60_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB60_4: ; %atomicrmw.start +; GCN3: .LBB60_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 @@ -9135,7 +9165,7 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB61_4: ; %atomicrmw.start +; GCN1: .LBB61_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 @@ -9196,7 +9226,7 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB61_4: ; %atomicrmw.start +; GCN2: .LBB61_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 @@ -9252,7 +9282,7 @@ define void @flat_atomic_or_i64_noret_offset(ptr 
%out, i64 %in) { ; GCN3-NEXT: .LBB61_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB61_4: ; %atomicrmw.start +; GCN3: .LBB61_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 @@ -9310,7 +9340,7 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB62_2: ; %atomicrmw.start +; GCN1: .LBB62_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -9368,7 +9398,7 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB62_2: ; %atomicrmw.start +; GCN2: .LBB62_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -9429,7 +9459,7 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB62_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB62_4: ; %atomicrmw.start +; GCN3: .LBB62_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -9494,7 +9524,7 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN1: .LBB63_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -9556,7 +9586,7 @@ define i64 
@flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN2: .LBB63_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -9613,7 +9643,7 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB63_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN3: .LBB63_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -9680,7 +9710,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB64_4: ; %atomicrmw.start +; GCN1: .LBB64_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 @@ -9741,7 +9771,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB64_4: ; %atomicrmw.start +; GCN2: .LBB64_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 @@ -9794,7 +9824,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB64_4: ; %atomicrmw.start +; GCN3: .LBB64_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 @@ -9858,7 +9888,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB65_4: ; %atomicrmw.start +; GCN1: .LBB65_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 @@ -9921,7 +9951,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB65_4: ; %atomicrmw.start +; GCN2: .LBB65_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 @@ -9976,7 +10006,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: .LBB65_4: ; %atomicrmw.start +; GCN3: .LBB65_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 @@ -10033,7 +10063,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB66_2: ; %atomicrmw.start +; GCN1: .LBB66_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -10092,7 +10122,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 
s[34:35], 0 -; GCN2-NEXT: .LBB66_2: ; %atomicrmw.start +; GCN2: .LBB66_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -10143,7 +10173,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB66_2: ; %atomicrmw.start +; GCN3: .LBB66_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -10205,7 +10235,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB67_2: ; %atomicrmw.start +; GCN1: .LBB67_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -10266,7 +10296,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB67_2: ; %atomicrmw.start +; GCN2: .LBB67_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -10319,7 +10349,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v3, s35 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: .LBB67_2: ; %atomicrmw.start +; GCN3: .LBB67_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -10645,7 +10675,7 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, 
i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB70_4: ; %atomicrmw.start +; GCN1: .LBB70_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -10704,7 +10734,7 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB70_4: ; %atomicrmw.start +; GCN2: .LBB70_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -10758,7 +10788,7 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB70_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB70_4: ; %atomicrmw.start +; GCN3: .LBB70_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -10822,7 +10852,7 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB71_4: ; %atomicrmw.start +; GCN1: .LBB71_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -10883,7 +10913,7 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB71_4: ; %atomicrmw.start +; GCN2: .LBB71_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 @@ 
-10939,7 +10969,7 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB71_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB71_4: ; %atomicrmw.start +; GCN3: .LBB71_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -10997,7 +11027,7 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB72_2: ; %atomicrmw.start +; GCN1: .LBB72_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -11055,7 +11085,7 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB72_2: ; %atomicrmw.start +; GCN2: .LBB72_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -11116,7 +11146,7 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB72_4: ; %atomicrmw.start +; GCN3: .LBB72_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -11181,7 +11211,7 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB73_4: ; %atomicrmw.start +; GCN1: .LBB73_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -11243,7 +11273,7 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB73_4: ; %atomicrmw.start +; GCN2: .LBB73_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -11300,7 +11330,7 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB73_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB73_4: ; %atomicrmw.start +; GCN3: .LBB73_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -11367,7 +11397,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB74_4: ; %atomicrmw.start +; GCN1: .LBB74_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -11428,7 +11458,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB74_4: ; %atomicrmw.start +; GCN2: .LBB74_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -11481,7 +11511,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB74_4: ; %atomicrmw.start +; 
GCN3: .LBB74_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -11545,7 +11575,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB75_4: ; %atomicrmw.start +; GCN1: .LBB75_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -11608,7 +11638,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB75_4: ; %atomicrmw.start +; GCN2: .LBB75_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -11663,7 +11693,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: .LBB75_4: ; %atomicrmw.start +; GCN3: .LBB75_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -11720,7 +11750,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB76_2: ; %atomicrmw.start +; GCN1: .LBB76_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -11779,7 +11809,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: 
flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB76_2: ; %atomicrmw.start +; GCN2: .LBB76_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -11830,7 +11860,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB76_2: ; %atomicrmw.start +; GCN3: .LBB76_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -11892,7 +11922,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB77_2: ; %atomicrmw.start +; GCN1: .LBB77_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -11953,7 +11983,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB77_2: ; %atomicrmw.start +; GCN2: .LBB77_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -12006,7 +12036,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v3, s35 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: .LBB77_2: ; %atomicrmw.start +; GCN3: .LBB77_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
v_mov_b32_e32 v7, v1 @@ -12332,6 +12362,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB80_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12391,6 +12422,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB80_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12445,6 +12477,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB80_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB80_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12509,6 +12542,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB81_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12570,6 +12604,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB81_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12626,6 +12661,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB81_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; 
GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB81_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12684,6 +12720,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB82_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12742,6 +12779,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB82_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12803,6 +12841,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB82_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB82_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12868,6 +12907,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB83_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12930,6 +12970,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB83_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12987,6 +13028,7 @@ define 
i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB83_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB83_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13056,6 +13098,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13122,6 +13165,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13180,6 +13224,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13249,6 +13294,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13317,6 +13363,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, 
s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13377,6 +13424,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13439,6 +13487,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13503,6 +13552,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13559,6 +13609,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13626,6 +13677,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13692,6 
+13744,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13750,6 +13803,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13828,6 +13882,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13899,6 +13954,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13969,6 +14025,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s3 ; GCN3-NEXT: v_mov_b32_e32 v7, s2 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14037,6 +14094,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_mov_b64 s[2:3], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: 
v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14108,6 +14166,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b64 s[2:3], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14178,6 +14237,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s13 ; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14259,6 +14319,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14328,6 +14389,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14396,6 +14458,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s3 ; GCN3-NEXT: v_mov_b32_e32 v7, s2 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-14461,6 +14524,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b64 s[2:3], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14530,6 +14594,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b64 s[2:3], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14598,6 +14663,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s13 ; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14931,6 +14997,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB94_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14990,6 +15057,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB94_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15044,6 +15112,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB94_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 
s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB94_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15108,6 +15177,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB95_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15169,6 +15239,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB95_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15225,6 +15296,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB95_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB95_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15283,6 +15355,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB96_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15341,6 +15414,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB96_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15402,6 +15476,7 @@ define 
i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB96_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB96_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15467,6 +15542,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB97_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15529,6 +15605,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB97_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15586,6 +15663,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB97_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB97_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15655,6 +15733,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15721,6 +15800,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; 
GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15779,6 +15859,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15848,6 +15929,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15916,6 +15998,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15976,6 +16059,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16038,6 +16122,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16102,6 
+16187,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16158,6 +16244,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16225,6 +16312,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16291,6 +16379,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16349,6 +16438,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16427,6 +16517,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; 
GCN1-NEXT: v_mov_b32_e32 v7, s2 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16498,6 +16589,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16568,6 +16660,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s3 ; GCN3-NEXT: v_mov_b32_e32 v7, s2 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16636,6 +16729,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_mov_b64 s[2:3], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16707,6 +16801,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_mov_b64 s[2:3], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16777,6 +16872,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s13 ; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) @@ -16851,6 +16947,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_mov_b64 s[2:3], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16920,6 +17017,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_mov_b64 s[2:3], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16988,6 +17086,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s13 ; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17321,6 +17420,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB107_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17380,6 +17480,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB107_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17434,6 +17535,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB107_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], 
v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB107_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17498,6 +17600,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB108_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17559,6 +17662,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB108_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17615,6 +17719,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB108_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB108_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17673,6 +17778,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB109_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17731,6 +17837,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB109_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) @@ -17792,6 +17899,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB109_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB109_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17857,6 +17965,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB110_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17919,6 +18028,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB110_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17976,6 +18086,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB110_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB110_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18045,6 +18156,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18111,6 +18223,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 
v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18169,6 +18282,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18238,6 +18352,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18306,6 +18421,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18366,6 +18482,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18428,6 +18545,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18492,6 +18610,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18548,6 +18667,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18615,6 +18735,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18681,6 +18802,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18739,6 +18861,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19068,6 +19191,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: 
flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB117_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19127,6 +19251,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB117_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19181,6 +19306,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB117_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB117_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19245,6 +19371,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB118_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19306,6 +19433,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB118_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19362,6 +19490,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB118_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB118_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19420,6 +19549,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB119_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19478,6 +19608,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB119_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19539,6 +19670,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB119_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB119_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19604,6 +19736,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB120_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19666,6 +19799,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB120_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19723,6 +19857,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB120_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], 
v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB120_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19792,6 +19927,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19858,6 +19994,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19916,6 +20053,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19985,6 +20123,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20053,6 +20192,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20113,6 +20253,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20175,6 +20316,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20239,6 +20381,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20295,6 +20438,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20362,6 +20506,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20428,6 +20573,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; 
GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20486,6 +20632,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20564,6 +20711,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20635,6 +20783,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20705,6 +20854,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s3 ; GCN3-NEXT: v_mov_b32_e32 v7, s2 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20773,6 +20923,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_mov_b64 s[2:3], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20844,6 +20995,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b64 s[2:3], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20914,6 +21066,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s13 ; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20991,6 +21144,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21056,6 +21210,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21120,6 +21275,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s3 ; GCN3-NEXT: v_mov_b32_e32 v7, s2 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21184,6 +21340,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b64 s[2:3], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; 
GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .p2align ; GCN1-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21253,6 +21410,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b64 s[2:3], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .p2align ; GCN2-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21321,6 +21479,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s13 ; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .p2align ; GCN3-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21654,7 +21813,7 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB131_4: ; %atomicrmw.start +; GCN1: .LBB131_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 @@ -21718,7 +21877,7 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB131_4: ; %atomicrmw.start +; GCN2: .LBB131_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 @@ -21777,7 +21936,7 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB131_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: 
.LBB131_4: ; %atomicrmw.start +; GCN3: .LBB131_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 @@ -21846,7 +22005,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB132_4: ; %atomicrmw.start +; GCN1: .LBB132_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 @@ -21912,7 +22071,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB132_4: ; %atomicrmw.start +; GCN2: .LBB132_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 @@ -21973,7 +22132,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB132_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB132_4: ; %atomicrmw.start +; GCN3: .LBB132_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 @@ -22036,7 +22195,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB133_2: ; %atomicrmw.start +; GCN1: .LBB133_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -22099,7 +22258,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; 
GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB133_2: ; %atomicrmw.start +; GCN2: .LBB133_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -22165,7 +22324,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB133_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB133_4: ; %atomicrmw.start +; GCN3: .LBB133_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -22235,7 +22394,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB134_4: ; %atomicrmw.start +; GCN1: .LBB134_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -22302,7 +22461,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB134_4: ; %atomicrmw.start +; GCN2: .LBB134_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -22364,7 +22523,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB134_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB134_4: ; %atomicrmw.start +; GCN3: .LBB134_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ 
-22436,7 +22595,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB135_4: ; %atomicrmw.start +; GCN1: .LBB135_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 @@ -22503,7 +22662,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB135_4: ; %atomicrmw.start +; GCN2: .LBB135_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 @@ -22562,7 +22721,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB135_4: ; %atomicrmw.start +; GCN3: .LBB135_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 @@ -22632,7 +22791,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB136_4: ; %atomicrmw.start +; GCN1: .LBB136_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 @@ -22701,7 +22860,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB136_4: ; 
%atomicrmw.start +; GCN2: .LBB136_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 @@ -22762,7 +22921,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: .LBB136_4: ; %atomicrmw.start +; GCN3: .LBB136_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 @@ -22825,7 +22984,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB137_2: ; %atomicrmw.start +; GCN1: .LBB137_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -22890,7 +23049,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB137_2: ; %atomicrmw.start +; GCN2: .LBB137_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -22947,7 +23106,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB137_2: ; %atomicrmw.start +; GCN3: .LBB137_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -23015,7 +23174,7 @@ define amdgpu_gfx i64 
@flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB138_2: ; %atomicrmw.start +; GCN1: .LBB138_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -23082,7 +23241,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB138_2: ; %atomicrmw.start +; GCN2: .LBB138_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -23141,7 +23300,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_mov_b32_e32 v3, s35 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: .LBB138_2: ; %atomicrmw.start +; GCN3: .LBB138_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -23491,7 +23650,7 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[10:11], 0 -; GCN1-NEXT: .LBB141_4: ; %atomicrmw.start +; GCN1: .LBB141_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -23558,7 +23717,7 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[10:11], 0 -; GCN2-NEXT: .LBB141_4: ; %atomicrmw.start +; GCN2: .LBB141_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -23620,7 +23779,7 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB141_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[10:11], 0 -; GCN3-NEXT: .LBB141_4: ; %atomicrmw.start +; GCN3: .LBB141_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -23692,7 +23851,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[10:11], 0 -; GCN1-NEXT: .LBB142_4: ; %atomicrmw.start +; GCN1: .LBB142_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -23761,7 +23920,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[10:11], 0 -; GCN2-NEXT: .LBB142_4: ; %atomicrmw.start +; GCN2: .LBB142_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -23825,7 +23984,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB142_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[10:11], 0 -; GCN3-NEXT: .LBB142_4: ; %atomicrmw.start +; GCN3: .LBB142_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -23891,7 +24050,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, 
v[0:1] ; GCN1-NEXT: s_mov_b64 s[10:11], 0 -; GCN1-NEXT: .LBB143_2: ; %atomicrmw.start +; GCN1: .LBB143_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -23958,7 +24117,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[10:11], 0 -; GCN2-NEXT: .LBB143_2: ; %atomicrmw.start +; GCN2: .LBB143_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -24028,7 +24187,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB143_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[10:11], 0 -; GCN3-NEXT: .LBB143_4: ; %atomicrmw.start +; GCN3: .LBB143_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -24102,7 +24261,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[10:11], 0 -; GCN1-NEXT: .LBB144_4: ; %atomicrmw.start +; GCN1: .LBB144_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -24173,7 +24332,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[10:11], 0 -; GCN2-NEXT: .LBB144_4: ; %atomicrmw.start +; GCN2: .LBB144_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -24239,7 +24398,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr 
%out, i64 %in) { ; GCN3-NEXT: .LBB144_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[10:11], 0 -; GCN3-NEXT: .LBB144_4: ; %atomicrmw.start +; GCN3: .LBB144_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -24317,7 +24476,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_mov_b64 s[38:39], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB145_4: ; %atomicrmw.start +; GCN1: .LBB145_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -24391,7 +24550,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_mov_b64 s[38:39], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB145_4: ; %atomicrmw.start +; GCN2: .LBB145_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -24457,7 +24616,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_mov_b64 s[38:39], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB145_4: ; %atomicrmw.start +; GCN3: .LBB145_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -24534,7 +24693,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-NEXT: s_mov_b64 s[40:41], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB146_4: ; %atomicrmw.start +; GCN1: .LBB146_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -24610,7 +24769,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2-NEXT: s_mov_b64 s[40:41], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB146_4: ; %atomicrmw.start +; GCN2: .LBB146_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -24678,7 +24837,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: s_mov_b64 s[40:41], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB146_4: ; %atomicrmw.start +; GCN3: .LBB146_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -24748,7 +24907,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: s_mov_b64 s[38:39], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB147_2: ; %atomicrmw.start +; GCN1: .LBB147_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -24821,7 +24980,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: s_mov_b64 s[38:39], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB147_2: ; %atomicrmw.start +; GCN2: .LBB147_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -24886,7 +25045,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_mov_b64 s[38:39], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 -; GCN3-NEXT: .LBB147_2: ; 
%atomicrmw.start +; GCN3: .LBB147_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -24962,7 +25121,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: s_mov_b64 s[40:41], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB148_2: ; %atomicrmw.start +; GCN1: .LBB148_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -25037,7 +25196,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: s_mov_b64 s[40:41], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB148_2: ; %atomicrmw.start +; GCN2: .LBB148_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -25104,7 +25263,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: s_mov_b64 s[40:41], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 -; GCN3-NEXT: .LBB148_2: ; %atomicrmw.start +; GCN3: .LBB148_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index 4dea4495b36fb..5a11541fcf54b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -1102,7 +1102,7 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7: 
.LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 @@ -1128,7 +1128,7 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 @@ -1151,7 +1151,7 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9: .LBB30_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 @@ -1183,7 +1183,7 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 @@ -1211,7 +1211,7 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 @@ -1234,7 +1234,7 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: 
flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9: .LBB31_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 @@ -1265,7 +1265,7 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -1293,7 +1293,7 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -1318,7 +1318,7 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9: .LBB32_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -1352,7 +1352,7 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -1380,7 +1380,7 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 
%in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -1403,7 +1403,7 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9: .LBB33_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -1443,7 +1443,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 @@ -1476,7 +1476,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 @@ -1504,7 +1504,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9: .LBB34_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 @@ -1541,7 +1541,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: flat_load_dword v2, v[4:5] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 @@ -1574,7 +1574,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: flat_load_dword v2, v[4:5] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 @@ -1602,7 +1602,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9: .LBB35_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 @@ -1640,7 +1640,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v8, v1 @@ -1673,7 +1673,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; 
GFX8: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v8, v1 @@ -1701,7 +1701,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9: .LBB36_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -1738,7 +1738,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: flat_load_dword v0, v[2:3] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v8, v1 @@ -1771,7 +1771,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: flat_load_dword v0, v[2:3] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v8, v1 @@ -1799,7 +1799,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9: .LBB37_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -1900,7 +1900,7 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 @@ -1926,7 +1926,7 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v5, v7, v3 @@ -1949,7 +1949,7 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX9: .LBB40_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 @@ -1981,7 +1981,7 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 @@ -2009,7 +2009,7 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v5, v7, v3 @@ -2032,7 +2032,7 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9: .LBB41_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 @@ -2063,7 +2063,7 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -2091,7 +2091,7 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -2116,7 +2116,7 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9: .LBB42_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -2150,7 +2150,7 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -2178,7 +2178,7 @@ define i64 
@flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -2201,7 +2201,7 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9: .LBB43_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -2240,7 +2240,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s7, v3 @@ -2272,7 +2272,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s7, v3 @@ -2299,7 +2299,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9: .LBB44_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 @@ -2335,7 +2335,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s7, v3 @@ -2367,7 +2367,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s7, v3 @@ -2394,7 +2394,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX9: .LBB45_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 @@ -2431,7 +2431,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -2463,7 +2463,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start 
+; GFX8: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -2490,7 +2490,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9: .LBB46_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v1 @@ -2526,7 +2526,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -2558,7 +2558,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -2585,7 +2585,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9: .LBB47_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v1 @@ -2686,6 +2686,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, 
v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2714,6 +2715,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2739,6 +2741,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2773,6 +2776,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2803,6 +2807,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2828,6 +2833,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2861,6 +2867,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2891,6 +2898,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2918,6 +2926,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2954,6 +2963,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2984,6 +2994,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3009,6 +3020,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 
v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3050,6 +3062,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3084,6 +3097,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3113,6 +3127,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3151,6 +3166,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3185,6 +3201,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3214,6 +3231,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3253,6 +3271,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3287,6 +3306,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3316,6 +3336,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3354,6 +3375,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3388,6 +3410,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; 
GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3417,6 +3440,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3452,6 +3476,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3482,6 +3507,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3507,6 +3533,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3542,6 +3569,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3572,6 +3600,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3597,6 +3626,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3636,7 +3666,7 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v5, v7, v3 @@ -3662,7 +3692,7 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v5, v7, v3 @@ -3685,7 +3715,7 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; 
GFX9: .LBB60_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 @@ -3717,7 +3747,7 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v5, v7, v3 @@ -3745,7 +3775,7 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v5, v7, v3 @@ -3768,7 +3798,7 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9: .LBB61_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 @@ -3799,7 +3829,7 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX7: .LBB62_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -3827,7 +3857,7 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX8: .LBB62_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -3852,7 +3882,7 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9: .LBB62_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -3886,7 +3916,7 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX7: .LBB63_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -3914,7 +3944,7 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX8: .LBB63_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3937,7 +3967,7 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9: .LBB63_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -3976,7 +4006,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; 
GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX7: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s7, v3 @@ -4008,7 +4038,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX8: .LBB64_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s7, v3 @@ -4035,7 +4065,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX9: .LBB64_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 @@ -4071,7 +4101,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX7: .LBB65_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s7, v3 @@ -4103,7 +4133,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX8: .LBB65_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s7, v3 
@@ -4130,7 +4160,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9: .LBB65_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 @@ -4167,7 +4197,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX7: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -4199,7 +4229,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX8: .LBB66_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -4226,7 +4256,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9: .LBB66_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v1 @@ -4262,7 +4292,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX7: .LBB67_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -4294,7 +4324,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX8: .LBB67_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -4321,7 +4351,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9: .LBB67_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v1 @@ -4422,7 +4452,7 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX7: .LBB70_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4448,7 +4478,7 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX8: .LBB70_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4471,7 +4501,7 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9: 
.LBB70_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4503,7 +4533,7 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX7: .LBB71_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4531,7 +4561,7 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX8: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4554,7 +4584,7 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX9: .LBB71_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4585,7 +4615,7 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX7: .LBB72_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -4613,7 +4643,7 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX8: .LBB72_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -4638,7 +4668,7 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX9: .LBB72_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -4672,7 +4702,7 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX7: .LBB73_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -4700,7 +4730,7 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX8: .LBB73_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -4723,7 +4753,7 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9: .LBB73_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -4762,7 +4792,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; 
GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX7: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -4794,7 +4824,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX8: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -4821,7 +4851,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9: .LBB74_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -4857,7 +4887,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX7: .LBB75_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -4889,7 +4919,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX8: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s7, 
v3 @@ -4916,7 +4946,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9: .LBB75_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -4953,7 +4983,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX7: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -4985,7 +5015,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX8: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -5012,7 +5042,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9: .LBB76_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v1 @@ -5048,7 +5078,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX7: .LBB77_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -5080,7 +5110,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX8: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -5107,7 +5137,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9: .LBB77_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v1 @@ -5208,6 +5238,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5235,6 +5266,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5259,6 +5291,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
@@ -5292,6 +5325,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5321,6 +5355,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5345,6 +5380,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5377,6 +5413,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5406,6 +5443,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5432,6 +5470,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5467,6 +5506,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5496,6 +5536,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5520,6 +5561,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5562,6 +5604,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5597,6 +5640,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
@@ -5627,6 +5671,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5666,6 +5711,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5701,6 +5747,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5731,6 +5778,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5771,6 +5819,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5806,6 +5855,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: 
v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5836,6 +5886,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5875,6 +5926,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5910,6 +5962,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5940,6 +5993,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5980,6 +6034,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6014,6 
+6069,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6046,6 +6102,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6086,6 +6143,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6123,6 +6181,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6158,6 +6217,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s13 ; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6202,6 +6262,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 
+; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6234,6 +6295,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6266,6 +6328,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6303,6 +6366,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6338,6 +6402,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6373,6 +6438,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s13 ; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6480,6 +6546,7 @@ define void 
@flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6507,6 +6574,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6531,6 +6599,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6564,6 +6633,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB95_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6593,6 +6663,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB95_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6617,6 +6688,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6649,6 +6721,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB96_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6678,6 +6751,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB96_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6704,6 +6778,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6739,6 +6814,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB97_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6768,6 +6844,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB97_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6792,6 +6869,7 @@ define i64 
@flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6834,6 +6912,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6869,6 +6948,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6899,6 +6979,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6938,6 +7019,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6973,6 +7055,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: 
v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7003,6 +7086,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7043,6 +7127,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7078,6 +7163,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7108,6 +7194,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7147,6 +7234,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-7182,6 +7270,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7212,6 +7301,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7252,6 +7342,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7286,6 +7377,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7318,6 +7410,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7358,6 +7451,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: 
v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7395,6 +7489,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7430,6 +7525,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s13 ; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7473,6 +7569,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7508,6 +7605,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7543,6 +7641,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s13 ; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-7650,6 +7749,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7677,6 +7777,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7701,6 +7802,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7734,6 +7836,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7763,6 +7866,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7787,6 +7891,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 
; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7819,6 +7924,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7848,6 +7954,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7874,6 +7981,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7909,6 +8017,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7938,6 +8047,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7962,6 +8072,7 
@@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8004,6 +8115,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8039,6 +8151,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8069,6 +8182,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8108,6 +8222,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8143,6 +8258,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; 
GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8173,6 +8289,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8213,6 +8330,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8248,6 +8366,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8278,6 +8397,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8317,6 +8437,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) @@ -8352,6 +8473,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8382,6 +8504,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8484,6 +8607,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8511,6 +8635,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8535,6 +8660,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8568,6 +8694,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8597,6 +8724,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8621,6 +8749,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8653,6 +8782,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8682,6 +8812,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8708,6 +8839,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-8743,6 +8875,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8772,6 +8905,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8796,6 +8930,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8838,6 +8973,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8873,6 +9009,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8903,6 +9040,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 ; 
GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8942,6 +9080,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8977,6 +9116,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9007,6 +9147,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9047,6 +9188,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9082,6 +9224,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) @@ -9112,6 +9255,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9151,6 +9295,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9186,6 +9331,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9216,6 +9362,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9256,6 +9403,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9290,6 +9438,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; 
GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9322,6 +9471,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9362,6 +9512,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9399,6 +9550,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9434,6 +9586,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s13 ; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9476,6 +9629,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9506,6 
+9660,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9536,6 +9691,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9572,6 +9728,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .p2align ; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9607,6 +9764,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9642,6 +9800,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s13 ; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9749,7 +9908,7 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB131_1: ; 
%atomicrmw.start +; GFX7: .LBB131_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6 @@ -9778,7 +9937,7 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX8: .LBB131_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6 @@ -9804,7 +9963,7 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX9: .LBB131_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 @@ -9839,7 +9998,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX7: .LBB132_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 @@ -9870,7 +10029,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX8: .LBB132_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 @@ -9896,7 +10055,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX9: .LBB132_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 @@ -9930,7 +10089,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX7: .LBB133_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -9961,7 +10120,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX8: .LBB133_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -9989,7 +10148,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX9: .LBB133_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -10026,7 +10185,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX7: .LBB134_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: 
v_mov_b32_e32 v9, v1 @@ -10057,7 +10216,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX8: .LBB134_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -10083,7 +10242,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX9: .LBB134_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -10125,7 +10284,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX7: .LBB135_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 @@ -10160,7 +10319,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX8: .LBB135_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 @@ -10190,7 +10349,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX9: 
.LBB135_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 @@ -10229,7 +10388,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX7: .LBB136_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 @@ -10264,7 +10423,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX8: .LBB136_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 @@ -10294,7 +10453,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX9: .LBB136_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 @@ -10334,7 +10493,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX7: .LBB137_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -10369,7 +10528,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX8: .LBB137_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -10399,7 +10558,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX9: .LBB137_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v1 @@ -10438,7 +10597,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX7: .LBB138_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v1 @@ -10473,7 +10632,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX8: .LBB138_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v1 @@ -10503,7 +10662,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX9: .LBB138_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v1 @@ 
-10607,7 +10766,7 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX7: .LBB141_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -10638,7 +10797,7 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX8: .LBB141_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -10666,7 +10825,7 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX9-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX9: .LBB141_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -10703,7 +10862,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v7, v[0:1] ; GFX7-NEXT: flat_load_dword v6, v[8:9] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX7: .LBB142_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -10736,7 +10895,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v7, v[0:1] ; GFX8-NEXT: flat_load_dword v6, v[8:9] ; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX8: .LBB142_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -10764,7 +10923,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX9: .LBB142_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -10800,7 +10959,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: flat_load_dword v5, v[5:6] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX7: .LBB143_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -10833,7 +10992,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX8: .LBB143_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -10863,7 +11022,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX9-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX9: .LBB143_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -10902,7 +11061,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; 
GFX7-NEXT: flat_load_dword v0, v[4:5] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX7: .LBB144_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -10935,7 +11094,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] ; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX8: .LBB144_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -10963,7 +11122,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX9: .LBB144_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -11009,7 +11168,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX7: .LBB145_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -11048,7 +11207,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX8: .LBB145_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ 
-11082,7 +11241,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX9: .LBB145_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -11125,7 +11284,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX7-NEXT: s_mov_b64 s[38:39], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX7: .LBB146_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -11164,7 +11323,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX8-NEXT: s_mov_b64 s[38:39], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX8: .LBB146_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -11198,7 +11357,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX9: .LBB146_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -11242,7 +11401,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX7: .LBB147_1: ; %atomicrmw.start ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -11281,7 +11440,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX8: .LBB147_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -11315,7 +11474,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX9: .LBB147_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v9, v1 @@ -11358,7 +11517,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX7-NEXT: s_mov_b64 s[38:39], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX7: .LBB148_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -11397,7 +11556,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX8-NEXT: s_mov_b64 s[38:39], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX8: .LBB148_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -11431,7 +11590,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; 
GFX9-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX9: .LBB148_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v9, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll index 6ef89a4ccd485..4153f00e9a6c6 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll @@ -162,6 +162,7 @@ define float @fold_abs_in_branch_phi(float %arg1, float %arg2) { ; GFX10-NEXT: s_cbranch_execz .LBB5_3 ; GFX10-NEXT: ; %bb.1: ; %header.preheader ; GFX10-NEXT: ; implicit-def: $vgpr0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB5_2: ; %header ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x40400000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 0cb2b0b7df3d2..f44d34d9ca145 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1734,6 +1734,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 @@ -2031,6 +2032,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll 
b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll index 6b02e6b05f1b7..7232cd4958260 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll @@ -22,6 +22,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_mov_b32_e32 v5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 9c1f9d21b9da3..21130749b5653 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -54,7 +54,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -78,7 +78,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -100,7 +100,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB0_1: 
; %atomicrmw.start +; GFX908: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -122,7 +122,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -148,7 +148,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -176,7 +176,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX6: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -239,7 +239,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX10: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -263,7 +263,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX90A: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -285,7 +285,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -309,7 +309,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -334,7 +334,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -362,7 +362,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX6: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -426,7 +426,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX10: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -450,7 +450,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX90A: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -472,7 +472,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX908: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -496,7 +496,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -527,7 +527,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: 
s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v0 @@ -560,7 +560,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX6: .LBB2_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v6, v0 @@ -623,7 +623,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX10: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -646,7 +646,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -667,7 +667,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX908: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -688,7 +688,7 @@ define void 
@global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -713,7 +713,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -740,7 +740,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX6: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -802,7 +802,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX10: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -825,7 +825,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A: 
.LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -846,7 +846,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -869,7 +869,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -894,7 +894,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -921,7 +921,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX6: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -984,7 +984,7 @@ define void 
@global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX10: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1007,7 +1007,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -1028,7 +1028,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1051,7 +1051,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1080,7 +1080,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB5_1: ; 
%atomicrmw.start +; GFX7: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1111,7 +1111,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX6: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1175,7 +1175,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1199,7 +1199,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1223,7 +1223,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1247,7 +1247,7 @@ define float 
@global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -1272,7 +1272,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -1300,7 +1300,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -1365,7 +1365,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1388,7 +1388,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; 
GFX90A: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -1411,7 +1411,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1434,7 +1434,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1459,7 +1459,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1486,7 +1486,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1539,7 +1539,7 @@ define float 
@global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1565,7 +1565,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1589,7 +1589,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1611,7 +1611,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1635,7 +1635,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -1660,7 +1660,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -1688,7 +1688,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -1742,7 +1742,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1768,7 +1768,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1792,7 +1792,7 @@ define float 
@global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1814,7 +1814,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1838,7 +1838,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -1863,7 +1863,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -1891,7 +1891,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB9_1: ; 
%atomicrmw.start +; GFX6: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -1945,7 +1945,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1969,7 +1969,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -1992,7 +1992,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -2013,7 +2013,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 
@@ -2036,7 +2036,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2061,7 +2061,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2088,7 +2088,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2151,7 +2151,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -2175,7 +2175,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 
0 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2197,7 +2197,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2221,7 +2221,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -2246,7 +2246,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -2274,7 +2274,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX6: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -2338,7 +2338,7 @@ define float 
@global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -2370,7 +2370,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2394,7 +2394,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -2419,7 +2419,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -2447,7 +2447,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB12_1: ; 
%atomicrmw.start +; GFX6: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -2501,7 +2501,7 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -2527,7 +2527,7 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -2551,7 +2551,7 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2573,7 +2573,7 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2597,7 
+2597,7 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -2622,7 +2622,7 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -2650,7 +2650,7 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -2704,7 +2704,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2728,7 +2728,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB14_1: 
; %atomicrmw.start +; GFX10: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2751,7 +2751,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -2772,7 +2772,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2795,7 +2795,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2820,7 +2820,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2847,7 +2847,7 @@ define 
void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2910,7 +2910,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2933,7 +2933,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -2954,7 +2954,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -2977,7 +2977,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3002,7 +3002,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3029,7 +3029,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3092,7 +3092,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3133,7 +3133,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3158,7 +3158,7 @@ 
define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3185,7 +3185,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3238,7 +3238,7 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3262,7 +3262,7 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3285,7 +3285,7 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 
s[4:5], 0 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -3306,7 +3306,7 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3329,7 +3329,7 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3354,7 +3354,7 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3381,7 +3381,7 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3434,7 +3434,7 
@@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -3460,7 +3460,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -3484,7 +3484,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -3506,7 +3506,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -3528,7 +3528,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: 
; %atomicrmw.start +; GFX8: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -3554,7 +3554,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -3582,7 +3582,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -3635,7 +3635,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3659,7 +3659,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3682,7 +3682,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX90A: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -3703,7 +3703,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3724,7 +3724,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3749,7 +3749,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3776,7 +3776,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX6: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -3828,7 +3828,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -3854,7 +3854,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -3878,7 +3878,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX90A: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -3900,7 +3900,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -3922,7 +3922,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -3948,7 +3948,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -3976,7 +3976,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX6: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -4029,7 +4029,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4053,7 +4053,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX10: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, 
v2 @@ -4076,7 +4076,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -4097,7 +4097,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4118,7 +4118,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4143,7 +4143,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4170,7 +4170,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; 
GFX6-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX6: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4232,7 +4232,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -4256,7 +4256,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -4278,7 +4278,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -4300,7 +4300,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -4326,7 +4326,7 @@ define float 
@global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -4354,7 +4354,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -4417,7 +4417,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4440,7 +4440,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -4461,7 +4461,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908: 
.LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4482,7 +4482,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4507,7 +4507,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4534,7 +4534,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -4600,7 +4600,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -4632,7 +4632,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -4654,7 +4654,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -4680,7 +4680,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -4708,7 +4708,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -4771,7 +4771,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt 
vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -4803,7 +4803,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -4827,7 +4827,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -4852,7 +4852,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -4880,7 +4880,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -4944,7 +4944,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off 
offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -4976,7 +4976,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -5000,7 +5000,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -5031,7 +5031,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v0 @@ -5064,7 +5064,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v6, v0 @@ -5127,7 +5127,7 @@ define void 
@global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5166,7 +5166,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5191,7 +5191,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5218,7 +5218,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5280,7 +5280,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10: .LBB28_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5321,7 +5321,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5346,7 +5346,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5373,7 +5373,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5436,7 +5436,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5477,7 +5477,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; 
GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5506,7 +5506,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5537,7 +5537,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5601,7 +5601,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -5635,7 +5635,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
v_mov_b32_e32 v4, v3 @@ -5659,7 +5659,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -5684,7 +5684,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -5712,7 +5712,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -5777,7 +5777,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5820,7 +5820,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: 
.LBB31_1: ; %atomicrmw.start +; GFX8: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5845,7 +5845,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5872,7 +5872,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -5935,7 +5935,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -5967,7 +5967,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -5991,7 +5991,7 @@ define 
float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -6016,7 +6016,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -6044,7 +6044,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -6108,7 +6108,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6149,7 +6149,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8: .LBB33_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6174,7 +6174,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6201,7 +6201,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6254,7 +6254,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -6280,7 +6280,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -6304,7 +6304,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -6326,7 +6326,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -6348,7 +6348,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -6374,7 +6374,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -6402,7 +6402,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -6455,7 +6455,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6479,7 +6479,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6502,7 +6502,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -6523,7 +6523,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6544,7 +6544,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6569,7 +6569,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6596,7 +6596,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6658,7 +6658,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -6690,7 +6690,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 
v4, v3 @@ -6712,7 +6712,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -6738,7 +6738,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -6766,7 +6766,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -6829,7 +6829,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6868,7 +6868,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8: 
.LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6893,7 +6893,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6920,7 +6920,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 @@ -6957,7 +6957,7 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -6993,7 +6993,7 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -7019,7 +7019,7 @@ define double 
@global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -7053,7 +7053,7 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -7077,7 +7077,7 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -7109,7 +7109,7 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 @@ -7143,7 +7143,7 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v11, v1 @@ -7179,7 +7179,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -7215,7 +7215,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -7241,7 +7241,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -7275,7 +7275,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -7301,7 +7301,7 @@ define double 
@global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -7331,7 +7331,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 @@ -7365,7 +7365,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v11, v1 @@ -7402,7 +7402,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -7438,7 +7438,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11: .LBB40_1: ; %atomicrmw.start ; GFX11-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -7464,7 +7464,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -7498,7 +7498,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -7524,7 +7524,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -7558,7 +7558,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 @@ -7596,7 +7596,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b64 
s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v11, v1 @@ -7633,7 +7633,7 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] @@ -7667,7 +7667,7 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7691,7 +7691,7 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7723,7 +7723,7 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7745,7 +7745,7 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7771,7 +7771,7 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7801,7 +7801,7 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7837,7 +7837,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] @@ -7871,7 +7871,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7895,7 +7895,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7927,7 +7927,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7951,7 +7951,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -7977,7 +7977,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7: 
.LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8007,7 +8007,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8044,7 +8044,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] @@ -8078,7 +8078,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8102,7 +8102,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8134,7 
+8134,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8158,7 +8158,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8188,7 +8188,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8222,7 +8222,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -8271,7 +8271,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB44_1: 
; %atomicrmw.start +; GFX12-TRUE16: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8315,7 +8315,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB44_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8355,7 +8355,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -8389,7 +8389,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB44_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8428,7 +8428,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB44_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8466,7 +8466,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8500,7 +8500,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -8532,7 +8532,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -8564,7 +8564,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8602,7 +8602,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 
-; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -8645,7 +8645,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -8694,7 +8694,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8739,7 +8739,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB45_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8781,7 +8781,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942: .LBB45_1: ; %atomicrmw.start ; 
GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -8816,7 +8816,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8856,7 +8856,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB45_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8895,7 +8895,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8930,7 +8930,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -8963,7 +8963,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -8996,7 +8996,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9035,7 +9035,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9079,7 +9079,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9130,7 +9130,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, 
v4 -; GFX12-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB46_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9175,7 +9175,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB46_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9218,7 +9218,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -9253,7 +9253,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB46_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9293,7 +9293,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: 
v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB46_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9332,7 +9332,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9367,7 +9367,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -9400,7 +9400,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -9433,7 +9433,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9472,7 +9472,7 @@ define half 
@global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9516,7 +9516,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9566,7 +9566,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB47_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9609,7 +9609,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB47_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9647,7 +9647,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: 
s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -9680,7 +9680,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB47_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9718,7 +9718,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB47_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9754,7 +9754,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9787,7 +9787,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -9818,7 +9818,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9849,7 +9849,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9886,7 +9886,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -9927,7 +9927,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -9975,7 +9975,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 
v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB48_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10019,7 +10019,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB48_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10059,7 +10059,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -10093,7 +10093,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB48_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10132,7 +10132,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB48_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10169,7 +10169,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10203,7 +10203,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -10235,7 +10235,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10267,7 +10267,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10305,7 +10305,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10347,7 +10347,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10396,7 +10396,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB49_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10440,7 +10440,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB49_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: 
v_lshrrev_b32_e32 v3, v5, v4 @@ -10481,7 +10481,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -10515,7 +10515,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB49_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10554,7 +10554,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB49_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10591,7 +10591,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10625,7 +10625,7 @@ 
define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -10657,7 +10657,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10689,7 +10689,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10727,7 +10727,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10769,7 +10769,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10809,7 +10809,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB50_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -10842,7 +10842,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB50_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -10873,7 +10873,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -10897,7 +10897,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB50_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -10925,7 +10925,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB50_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -10954,7 +10954,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -10981,7 +10981,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11005,7 +11005,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11030,7 +11030,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; 
GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -11059,7 +11059,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -11093,7 +11093,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -11132,7 +11132,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB51_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l @@ -11164,7 +11164,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB51_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 @@ -11193,7 +11193,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 @@ -11216,7 +11216,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB51_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l @@ -11243,7 +11243,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB51_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 @@ -11270,7 +11270,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt 
vmcnt(0) ; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 @@ -11296,7 +11296,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 @@ -11319,7 +11319,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 @@ -11343,7 +11343,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_e32 v3, v4, v2 @@ -11372,7 +11372,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -11405,7 +11405,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 
v4, v2 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -11452,7 +11452,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB52_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -11498,7 +11498,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB52_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -11541,7 +11541,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -11576,7 +11576,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; 
GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB52_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -11616,7 +11616,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB52_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -11655,7 +11655,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -11690,7 +11690,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -11725,7 +11725,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -11758,7 +11758,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -11797,7 +11797,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -11841,7 +11841,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -11892,7 +11892,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB53_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11937,7 +11937,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB53_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11978,7 +11978,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -12012,7 +12012,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB53_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -12051,7 +12051,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB53_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -12088,7 +12088,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; 
GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -12122,7 +12122,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -12156,7 +12156,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -12188,7 +12188,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -12226,7 +12226,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) 
; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12268,7 +12268,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12320,7 +12320,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12377,7 +12377,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12428,7 +12428,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -12569,7 +12569,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -12610,7 +12610,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -12649,7 +12649,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -12687,7 +12687,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -12731,7 +12731,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ 
-12774,7 +12774,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -12823,7 +12823,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12883,7 +12883,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12936,7 +12936,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -13081,7 +13081,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -13123,7 +13123,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -13163,7 +13163,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -13202,7 +13202,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -13247,7 +13247,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ 
-13291,7 +13291,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -13342,7 +13342,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -13402,7 +13402,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -13456,7 +13456,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -13601,7 +13601,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -13643,7 +13643,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -13683,7 +13683,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -13722,7 +13722,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -13767,7 +13767,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ 
-13811,7 +13811,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -13861,7 +13861,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -13916,7 +13916,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -13965,7 +13965,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14101,7 +14101,7 @@ define void 
@global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14141,7 +14141,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14179,7 +14179,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14216,7 +14216,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14259,7 +14259,7 @@ define void 
@global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -14300,7 +14300,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -14348,7 +14348,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -14406,7 +14406,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB58_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -14457,7 +14457,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: 
v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX942: .LBB58_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14597,7 +14597,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14638,7 +14638,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14677,7 +14677,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX908: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14715,7 +14715,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 
; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14759,7 +14759,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -14801,7 +14801,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX6: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -14850,7 +14850,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -14908,7 +14908,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; 
GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB59_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -14960,7 +14960,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX942: .LBB59_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -15100,7 +15100,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -15141,7 +15141,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -15180,7 +15180,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB59_1: 
; %atomicrmw.start +; GFX908: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -15218,7 +15218,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -15262,7 +15262,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -15304,7 +15304,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX6: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -15344,7 +15344,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB60_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -15391,7 +15391,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB60_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -15433,7 +15433,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX942: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -15546,7 +15546,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -15581,7 +15581,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -15614,7 +15614,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX908-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX908: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -15647,7 +15647,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -15683,7 +15683,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15717,7 +15717,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX6: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15756,7 +15756,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB61_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l @@ -15801,7 +15801,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB61_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15841,7 +15841,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX942: .LBB61_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15949,7 +15949,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15983,7 +15983,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16015,7 +16015,7 @@ define void 
@global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX908: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16047,7 +16047,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16083,7 +16083,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16116,7 +16116,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX6: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16163,7 +16163,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: 
.LBB62_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB62_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -16224,7 +16224,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB62_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -16278,7 +16278,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX942: .LBB62_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -16423,7 +16423,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX10: .LBB62_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -16465,7 +16465,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX90A: .LBB62_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -16507,7 
+16507,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX908: .LBB62_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -16546,7 +16546,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX8: .LBB62_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -16591,7 +16591,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX7: .LBB62_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -16635,7 +16635,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX6: .LBB62_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -16686,7 +16686,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB63_1: ; 
%atomicrmw.start +; GFX12-TRUE16: .LBB63_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -16745,7 +16745,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB63_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -16797,7 +16797,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX942: .LBB63_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -16937,7 +16937,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX10: .LBB63_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -16978,7 +16978,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX90A: 
.LBB63_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -17019,7 +17019,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX908: .LBB63_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -17057,7 +17057,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX8: .LBB63_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -17101,7 +17101,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX7: .LBB63_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -17143,7 +17143,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX6: .LBB63_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -17206,7 +17206,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX11: .LBB64_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -17232,7 +17232,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX10: .LBB64_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -17264,7 +17264,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX908: .LBB64_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -17286,7 +17286,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX8: .LBB64_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -17322,7 +17322,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; 
GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX7: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -17371,7 +17371,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX6: .LBB64_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -17437,7 +17437,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX11: .LBB65_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -17463,7 +17463,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX10: .LBB65_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -17495,7 +17495,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX908: .LBB65_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -17519,7 +17519,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX8: .LBB65_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -17554,7 +17554,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX7: .LBB65_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -17603,7 +17603,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX6: .LBB65_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -17670,7 +17670,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX11: .LBB66_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -17696,7 +17696,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 
s4, 0 -; GFX10-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX10: .LBB66_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -17728,7 +17728,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX908: .LBB66_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -17752,7 +17752,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX8: .LBB66_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -17791,7 +17791,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX7: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -17842,7 +17842,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX6: .LBB66_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -17907,7 +17907,7 @@ define void 
@global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX11: .LBB67_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17931,7 +17931,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX10: .LBB67_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -17970,7 +17970,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX8: .LBB67_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -18005,7 +18005,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX7: .LBB67_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -18052,7 +18052,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; 
GFX6-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX6: .LBB67_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -18116,7 +18116,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX11: .LBB68_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18140,7 +18140,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX10: .LBB68_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18181,7 +18181,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX8: .LBB68_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -18216,7 +18216,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX7: .LBB68_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; 
GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -18263,7 +18263,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX6: .LBB68_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -18328,7 +18328,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX11: .LBB69_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18352,7 +18352,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX10: .LBB69_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18393,7 +18393,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX8: .LBB69_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -18432,7 +18432,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 
v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX7: .LBB69_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -18483,7 +18483,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX6: .LBB69_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -18549,7 +18549,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX11: .LBB70_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -18575,7 +18575,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX10: .LBB70_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -18609,7 +18609,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX908: .LBB70_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -18633,7 +18633,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX8: .LBB70_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -18668,7 +18668,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX7: .LBB70_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -18717,7 +18717,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX6: .LBB70_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -18785,7 +18785,7 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX11: .LBB71_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18809,7 +18809,7 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, 
v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX10: .LBB71_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -18852,7 +18852,7 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX8: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -18887,7 +18887,7 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX7: .LBB71_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -18934,7 +18934,7 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX6: .LBB71_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -18999,7 +18999,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX11: .LBB72_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -19025,7 +19025,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX10: .LBB72_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -19049,7 +19049,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX90A: .LBB72_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -19071,7 +19071,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX908: .LBB72_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -19093,7 +19093,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX8: .LBB72_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -19129,7 +19129,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; 
GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX7: .LBB72_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -19178,7 +19178,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX6: .LBB72_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -19244,7 +19244,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX11: .LBB73_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -19268,7 +19268,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX10: .LBB73_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -19291,7 +19291,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX90A: .LBB73_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 
@@ -19312,7 +19312,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX908: .LBB73_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 @@ -19333,7 +19333,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX8: .LBB73_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -19368,7 +19368,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX7: .LBB73_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -19415,7 +19415,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX6: .LBB73_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -19479,7 +19479,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, 
v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX11: .LBB74_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -19505,7 +19505,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX10: .LBB74_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -19537,7 +19537,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX908: .LBB74_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -19559,7 +19559,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX8: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -19595,7 +19595,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX7: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -19644,7 
+19644,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX6: .LBB74_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -19710,7 +19710,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX11: .LBB75_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -19734,7 +19734,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX10: .LBB75_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -19773,7 +19773,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX8: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -19808,7 +19808,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX7: .LBB75_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -19855,7 +19855,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX6: .LBB75_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -19919,7 +19919,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX11: .LBB76_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -19945,7 +19945,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX10: .LBB76_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -19969,7 +19969,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX90A: .LBB76_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -19991,7 
+19991,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX908: .LBB76_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -20013,7 +20013,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX8: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -20049,7 +20049,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX7: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -20098,7 +20098,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX6: .LBB76_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -20164,7 +20164,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start +; 
GFX11: .LBB77_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 @@ -20188,7 +20188,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX10: .LBB77_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 @@ -20211,7 +20211,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX90A: .LBB77_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 @@ -20232,7 +20232,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX908: .LBB77_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 @@ -20253,7 +20253,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX8: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 @@ -20288,7 +20288,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX7: .LBB77_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -20335,7 +20335,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX6: .LBB77_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -20504,7 +20504,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX10: .LBB78_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -20546,7 +20546,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX90A: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -20586,7 +20586,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB78_1: ; 
%atomicrmw.start +; GFX908: .LBB78_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -20624,7 +20624,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX8: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -20674,7 +20674,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX7: .LBB78_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -20719,7 +20719,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX6: .LBB78_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -20883,7 +20883,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX10: .LBB79_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -20925,7 +20925,7 @@ define <2 x bfloat> 
@global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX90A: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -20965,7 +20965,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX908: .LBB79_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -21005,7 +21005,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX8: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -21054,7 +21054,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX7: .LBB79_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -21099,7 +21099,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX6: .LBB79_1: ; 
%atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -21264,7 +21264,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX10: .LBB80_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -21306,7 +21306,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX90A: .LBB80_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -21346,7 +21346,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX908: .LBB80_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -21386,7 +21386,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX8: .LBB80_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -21439,7 +21439,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) 
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX7: .LBB80_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -21486,7 +21486,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX6: .LBB80_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -21645,7 +21645,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX10: .LBB81_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21686,7 +21686,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX90A: .LBB81_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21725,7 +21725,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX908: .LBB81_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21762,7 +21762,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX8: .LBB81_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21811,7 +21811,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX7: .LBB81_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -21854,7 +21854,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX6: .LBB81_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -22012,7 +22012,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX10: .LBB82_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22053,7 +22053,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 
s9, 0x7060302 -; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX90A: .LBB82_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22092,7 +22092,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX908: .LBB82_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22131,7 +22131,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX8: .LBB82_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22180,7 +22180,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX7: .LBB82_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -22223,7 +22223,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX6: .LBB82_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -22382,7 +22382,7 @@ define 
void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX10: .LBB83_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22423,7 +22423,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX90A: .LBB83_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22462,7 +22462,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX908: .LBB83_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22501,7 +22501,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX8: .LBB83_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22554,7 +22554,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX7: 
.LBB83_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -22601,7 +22601,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX6: .LBB83_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -22765,7 +22765,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX10: .LBB84_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -22807,7 +22807,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX90A: .LBB84_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -22849,7 +22849,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX908: .LBB84_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -22889,7 +22889,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; 
GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX8: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -22938,7 +22938,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX7: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -22983,7 +22983,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX6: .LBB84_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -23145,7 +23145,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX10: .LBB85_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -23186,7 +23186,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX90A: .LBB85_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -23227,7 +23227,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX908: .LBB85_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -23266,7 +23266,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX8: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -23315,7 +23315,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX7: .LBB85_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -23358,7 +23358,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX6: .LBB85_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -23521,7 +23521,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX10: .LBB86_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -23563,7 +23563,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX90A: .LBB86_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -23603,7 +23603,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX908: .LBB86_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -23641,7 +23641,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX8: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -23691,7 +23691,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX7: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, 
v3 @@ -23736,7 +23736,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX6: .LBB86_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -23896,7 +23896,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX10: .LBB87_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -23937,7 +23937,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX90A: .LBB87_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -23976,7 +23976,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX908: .LBB87_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -24013,7 +24013,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; 
GFX8-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX8: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -24062,7 +24062,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX7: .LBB87_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -24105,7 +24105,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX6: .LBB87_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -24267,7 +24267,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX10: .LBB88_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -24309,7 +24309,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX90A: .LBB88_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -24349,7 +24349,7 @@ define <2 x bfloat> 
@global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX908: .LBB88_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -24387,7 +24387,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX8: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -24437,7 +24437,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX7: .LBB88_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -24482,7 +24482,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX6: .LBB88_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -24642,7 +24642,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX10: .LBB89_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -24683,7 +24683,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX90A: .LBB89_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -24722,7 +24722,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX908: .LBB89_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -24759,7 +24759,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX8: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -24808,7 +24808,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX7: .LBB89_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -24851,7 +24851,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX6: .LBB89_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -25013,7 +25013,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX10: .LBB90_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -25055,7 +25055,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX90A: .LBB90_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -25095,7 +25095,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX908: .LBB90_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -25133,7 +25133,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX8: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -25183,7 +25183,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX7: .LBB90_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -25228,7 +25228,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX6: .LBB90_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -25388,7 +25388,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX10: .LBB91_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -25429,7 +25429,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX90A: .LBB91_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -25468,7 +25468,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 
0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX908: .LBB91_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -25505,7 +25505,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX8: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -25554,7 +25554,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX7: .LBB91_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -25597,7 +25597,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX6: .LBB91_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index f7cc0709109f9..6d18d640a7eb3 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -36,7 +36,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: 
s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -81,7 +81,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -105,7 +105,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX908: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -129,7 +129,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -198,7 +198,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -243,7 +243,7 @@ define 
float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX90A: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -267,7 +267,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -293,7 +293,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -362,7 +362,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX942: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -407,7 +407,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX90A: 
.LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -431,7 +431,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX908: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -457,7 +457,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -526,7 +526,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -570,7 +570,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -593,7 +593,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dword 
v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX908: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -616,7 +616,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -682,7 +682,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -726,7 +726,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -749,7 +749,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -774,7 +774,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -841,7 +841,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -885,7 +885,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -908,7 +908,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -933,7 +933,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 
v4, 1.0, v2 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -1001,7 +1001,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1046,7 +1046,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1072,7 +1072,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1098,7 +1098,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -1168,7 +1168,7 @@ define void 
@global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1212,7 +1212,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1237,7 +1237,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1262,7 +1262,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -1329,7 +1329,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; 
GFX942: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1354,7 +1354,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1382,7 +1382,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1408,7 +1408,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1432,7 +1432,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1456,7 +1456,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; 
GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1484,7 +1484,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -1514,7 +1514,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -1560,7 +1560,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1605,7 +1605,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1629,7 
+1629,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1653,7 +1653,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1726,7 +1726,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1771,7 +1771,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1795,7 +1795,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1819,7 +1819,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1888,7 +1888,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1933,7 +1933,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1957,7 +1957,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1983,7 +1983,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: 
flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -2052,7 +2052,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -2097,7 +2097,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2121,7 +2121,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2147,7 +2147,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -2216,7 +2216,7 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2260,7 +2260,7 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2283,7 +2283,7 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2306,7 +2306,7 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2372,7 +2372,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: 
v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2416,7 +2416,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2439,7 +2439,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2464,7 +2464,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2531,7 +2531,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 
@@ -2575,7 +2575,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2598,7 +2598,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2623,7 +2623,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2691,7 +2691,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -2736,7 +2736,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2762,7 +2762,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2788,7 +2788,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -2858,7 +2858,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2902,7 +2902,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2927,7 +2927,7 @@ define void 
@global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2952,7 +2952,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -3013,7 +3013,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3051,7 +3051,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3097,7 +3097,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -3123,7 +3123,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -3186,7 +3186,7 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3224,7 +3224,7 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3270,7 +3270,7 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -3298,7 +3298,7 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3360,7 +3360,7 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3398,7 +3398,7 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3444,7 +3444,7 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -3472,7 +3472,7 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dwordx2 
v[0:1], v[4:5] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3534,7 +3534,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3571,7 +3571,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3616,7 +3616,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3640,7 +3640,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3697,7 +3697,7 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3734,7 +3734,7 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3779,7 +3779,7 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3805,7 +3805,7 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3863,7 +3863,7 @@ define void 
@global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3900,7 +3900,7 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3945,7 +3945,7 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3971,7 +3971,7 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4029,7 +4029,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], 
v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4067,7 +4067,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4095,7 +4095,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -4123,7 +4123,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] @@ -4148,7 +4148,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -4174,7 +4174,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -4206,7 +4206,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 @@ -4240,7 +4240,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v11, v1 @@ -4278,7 +4278,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4316,7 +4316,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11-NEXT: global_load_b64 
v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4362,7 +4362,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -4388,7 +4388,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -4463,7 +4463,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -4510,7 +4510,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; 
GFX12-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -4552,7 +4552,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -4588,7 +4588,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -4630,7 +4630,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -4670,7 +4670,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 
v6, v5 @@ -4706,7 +4706,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4740,7 +4740,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4774,7 +4774,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4813,7 +4813,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4856,7 +4856,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4908,7 +4908,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -4957,7 +4957,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5001,7 +5001,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -5040,7 +5040,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5083,7 
+5083,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5124,7 +5124,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -5161,7 +5161,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -5196,7 +5196,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -5231,7 +5231,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; 
GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -5271,7 +5271,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5315,7 +5315,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5369,7 +5369,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5418,7 +5418,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: 
v_mov_b32_e32 v6, v5 @@ -5463,7 +5463,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX942: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -5502,7 +5502,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5545,7 +5545,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5586,7 +5586,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -5623,7 +5623,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: 
v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -5658,7 +5658,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -5693,7 +5693,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -5733,7 +5733,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5777,7 +5777,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5828,7 +5828,7 @@ define 
void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5873,7 +5873,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5914,7 +5914,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5949,7 +5949,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5989,7 +5989,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6028,7 +6028,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6063,7 +6063,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -6096,7 +6096,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6129,7 +6129,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6167,7 +6167,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -6208,7 +6208,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -6259,7 +6259,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -6306,7 +6306,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6349,7 +6349,7 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX942: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6387,7 +6387,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -6428,7 +6428,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6468,7 +6468,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6504,7 +6504,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6538,7 +6538,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6572,7 +6572,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6611,7 +6611,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6653,7 +6653,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6705,7 +6705,7 @@ define 
void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -6752,7 +6752,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6796,7 +6796,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX942: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6834,7 +6834,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -6875,7 +6875,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6915,7 +6915,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6951,7 +6951,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6985,7 +6985,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7019,7 +7019,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8: .LBB31_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7058,7 +7058,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7100,7 +7100,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7141,7 +7141,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7177,7 +7177,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7210,7 +7210,7 @@ 
define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX942: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -7236,7 +7236,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7267,7 +7267,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7298,7 +7298,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -7327,7 +7327,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; 
GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -7353,7 +7353,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -7380,7 +7380,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -7410,7 +7410,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7444,7 +7444,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7484,7 +7484,7 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l @@ -7518,7 +7518,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -7550,7 +7550,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX942: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7575,7 +7575,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l @@ -7604,7 +7604,7 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7634,7 +7634,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7662,7 +7662,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7687,7 +7687,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7713,7 +7713,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB33_1: ; 
%atomicrmw.start +; GFX8: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7743,7 +7743,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7776,7 +7776,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7826,7 +7826,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -7876,7 +7876,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -7921,7 +7921,7 @@ 
define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX942: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -7960,7 +7960,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8003,7 +8003,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB34_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8044,7 +8044,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8081,7 +8081,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8118,7 +8118,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8153,7 +8153,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8193,7 +8193,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8237,7 +8237,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8291,7 +8291,7 @@ define void 
@global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -8339,7 +8339,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8383,7 +8383,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX942: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8421,7 +8421,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -8462,7 +8462,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8502,7 +8502,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8538,7 +8538,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8574,7 +8574,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8608,7 +8608,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8: .LBB35_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8647,7 +8647,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8689,7 +8689,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8741,7 +8741,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB36_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8798,7 +8798,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB36_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8849,7 +8849,7 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -8990,7 +8990,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9031,7 +9031,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -9070,7 +9070,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -9108,7 +9108,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9152,7 +9152,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -9196,7 +9196,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -9246,7 +9246,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9306,7 +9306,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9359,7 +9359,7 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -9504,7 +9504,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9546,7 +9546,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -9586,7 +9586,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -9625,7 +9625,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9670,7 +9670,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9715,7 +9715,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9767,7 +9767,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9827,7 +9827,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9881,7 +9881,7 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX942: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -10026,7 +10026,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10068,7 +10068,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10108,7 +10108,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10147,7 +10147,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10192,7 +10192,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10237,7 +10237,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10288,7 +10288,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB39_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10343,7 +10343,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB39_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10392,7 +10392,7 @@ define void 
@global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942: .LBB39_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10528,7 +10528,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10568,7 +10568,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10606,7 +10606,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10643,7 +10643,7 
@@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10686,7 +10686,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10728,7 +10728,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10777,7 +10777,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10835,7 +10835,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 
0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10886,7 +10886,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942: .LBB40_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11026,7 +11026,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11067,7 +11067,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11106,7 +11106,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11144,7 +11144,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11188,7 +11188,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11231,7 +11231,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11281,7 +11281,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; 
GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11339,7 +11339,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11391,7 +11391,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942: .LBB41_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11531,7 +11531,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11572,7 +11572,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; 
%atomicrmw.start +; GFX90A: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11611,7 +11611,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11649,7 +11649,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11693,7 +11693,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11736,7 +11736,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11777,7 +11777,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -11824,7 +11824,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB42_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -11866,7 +11866,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -11979,7 +11979,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -12014,7 +12014,7 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -12047,7 +12047,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -12080,7 +12080,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -12116,7 +12116,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12151,7 +12151,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12191,7 +12191,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB43_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l @@ -12236,7 +12236,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB43_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12276,7 +12276,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942: .LBB43_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12384,7 +12384,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12418,7 +12418,7 
@@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12450,7 +12450,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12482,7 +12482,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12518,7 +12518,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12552,7 +12552,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6: .LBB43_1: ; 
%atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12600,7 +12600,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12661,7 +12661,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB44_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12715,7 +12715,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -12860,7 +12860,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_mov_b32_e32 v6, v5 @@ -12902,7 +12902,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -12944,7 +12944,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -12983,7 +12983,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -13028,7 +13028,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -13073,7 +13073,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6: .LBB44_1: ; 
%atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -13125,7 +13125,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -13184,7 +13184,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB45_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13236,7 +13236,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942: .LBB45_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13376,7 +13376,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10: .LBB45_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13417,7 +13417,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13458,7 +13458,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13496,7 +13496,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13540,7 +13540,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13583,7 +13583,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13629,7 +13629,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -13658,7 +13658,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -13684,7 +13684,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -13712,7 +13712,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 
; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -13738,7 +13738,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13762,7 +13762,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13787,7 +13787,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -13825,7 +13825,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, 
v2 @@ -13874,7 +13874,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13922,7 +13922,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -13951,7 +13951,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -13977,7 +13977,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14005,7 +14005,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: 
; %atomicrmw.start +; GFX10: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -14031,7 +14031,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14055,7 +14055,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14082,7 +14082,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14119,7 +14119,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -14168,7 +14168,7 @@ 
define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -14217,7 +14217,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14246,7 +14246,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14272,7 +14272,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14300,7 +14300,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; 
GFX10: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -14326,7 +14326,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14350,7 +14350,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14377,7 +14377,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14418,7 +14418,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14469,7 +14469,7 @@ define <2 x half> 
@global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14516,7 +14516,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -14544,7 +14544,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14569,7 +14569,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14596,7 +14596,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14621,7 +14621,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14644,7 +14644,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14668,7 +14668,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -14705,7 +14705,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -14752,7 +14752,7 @@ define void 
@global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -14798,7 +14798,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -14826,7 +14826,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14851,7 +14851,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14878,7 +14878,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; 
GFX10: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14903,7 +14903,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14926,7 +14926,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14952,7 +14952,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -14989,7 +14989,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; 
GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15036,7 +15036,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15083,7 +15083,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15111,7 +15111,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15136,7 +15136,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15163,7 +15163,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; 
GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15188,7 +15188,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15211,7 +15211,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15237,7 +15237,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15278,7 +15278,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15329,7 +15329,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15376,7 +15376,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -15406,7 +15406,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -15432,7 +15432,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -15460,7 +15460,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10-NEXT: global_load_dword v3, v[0:1], 
off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -15486,7 +15486,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -15512,7 +15512,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -15539,7 +15539,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -15576,7 +15576,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -15625,7 +15625,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -15674,7 +15674,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15703,7 +15703,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15728,7 +15728,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15755,7 +15755,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: global_load_dword v3, v[0:1], off 
offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15780,7 +15780,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15805,7 +15805,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15831,7 +15831,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15868,7 +15868,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; 
GFX7: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15915,7 +15915,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15967,7 +15967,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16021,7 +16021,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16071,7 +16071,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -16210,7 
+16210,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16252,7 +16252,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16292,7 +16292,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16330,7 +16330,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -16380,7 +16380,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7: 
.LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16425,7 +16425,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16471,7 +16471,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16525,7 +16525,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16575,7 +16575,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, 
v3 @@ -16714,7 +16714,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16756,7 +16756,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16796,7 +16796,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16836,7 +16836,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16885,7 +16885,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB55_1: ; 
%atomicrmw.start +; GFX7: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16930,7 +16930,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16977,7 +16977,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -17031,7 +17031,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -17081,7 +17081,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -17220,7 +17220,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -17262,7 +17262,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -17302,7 +17302,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -17342,7 +17342,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -17395,7 +17395,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: 
.LBB56_1: ; %atomicrmw.start +; GFX7: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -17442,7 +17442,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -17487,7 +17487,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17539,7 +17539,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17588,7 +17588,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17722,7 +17722,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17763,7 +17763,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17802,7 +17802,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17839,7 +17839,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17888,7 +17888,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17931,7 +17931,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17975,7 +17975,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18027,7 +18027,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB58_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18076,7 +18076,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX942: .LBB58_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18210,7 +18210,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18251,7 +18251,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18290,7 +18290,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX908: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18329,7 +18329,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18378,7 +18378,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18421,7 +18421,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX6: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18466,7 +18466,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18518,7 +18518,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB59_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18567,7 +18567,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX942: .LBB59_1: ; 
%atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18701,7 +18701,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18742,7 +18742,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18781,7 +18781,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX908: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18820,7 +18820,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18873,7 +18873,7 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18920,7 +18920,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX6: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18965,7 +18965,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB60_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -19020,7 +19020,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB60_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -19071,7 +19071,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: 
s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX942: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -19210,7 +19210,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -19252,7 +19252,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -19294,7 +19294,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX908: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -19334,7 +19334,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -19383,7 +19383,7 @@ define <2 x bfloat> 
@global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -19428,7 +19428,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX6: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -19475,7 +19475,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB61_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19528,7 +19528,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB61_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19578,7 +19578,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX942: .LBB61_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19712,7 +19712,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19753,7 +19753,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19794,7 +19794,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX908: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19833,7 +19833,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19882,7 +19882,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -19925,7 +19925,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX6: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index b81af1fc9233d..25b6da3f60f65 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -36,7 +36,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -81,7 +81,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -105,7 +105,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX908: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -129,7 +129,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -198,7 +198,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -243,7 +243,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX90A: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -267,7 +267,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; 
GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -293,7 +293,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -362,7 +362,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX942: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -407,7 +407,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX90A: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -431,7 +431,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX908: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -457,7 +457,7 @@ define float 
@global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -526,7 +526,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -570,7 +570,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -593,7 +593,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX908: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -616,7 +616,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -682,7 +682,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -726,7 +726,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -749,7 +749,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -774,7 +774,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -841,7 +841,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; 
GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -885,7 +885,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -908,7 +908,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -933,7 +933,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -1001,7 +1001,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1046,7 +1046,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1072,7 +1072,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1098,7 +1098,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -1168,7 +1168,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1212,7 +1212,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: 
v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1237,7 +1237,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -1262,7 +1262,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -1329,7 +1329,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1354,7 +1354,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1382,7 +1382,7 @@ define float 
@global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1408,7 +1408,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1432,7 +1432,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1456,7 +1456,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1484,7 +1484,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -1514,7 +1514,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -1560,7 +1560,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1605,7 +1605,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1629,7 +1629,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1653,7 +1653,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: 
v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1726,7 +1726,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1771,7 +1771,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1795,7 +1795,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1819,7 +1819,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1888,7 +1888,7 @@ define float 
@global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1933,7 +1933,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1957,7 +1957,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1983,7 +1983,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -2052,7 +2052,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; 
GFX942: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -2097,7 +2097,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2121,7 +2121,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2147,7 +2147,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -2216,7 +2216,7 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2260,7 +2260,7 @@ define void 
@global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2283,7 +2283,7 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2306,7 +2306,7 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2372,7 +2372,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2416,7 +2416,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: 
.LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2439,7 +2439,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2464,7 +2464,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2531,7 +2531,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2575,7 +2575,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2598,7 +2598,7 @@ define void 
@global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2623,7 +2623,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -2691,7 +2691,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -2736,7 +2736,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2762,7 +2762,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB16_1: ; 
%atomicrmw.start +; GFX908: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2788,7 +2788,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -2858,7 +2858,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2902,7 +2902,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2927,7 +2927,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 @@ -2952,7 +2952,7 @@ define void 
@global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 @@ -3013,7 +3013,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3051,7 +3051,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3097,7 +3097,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -3123,7 +3123,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: 
.LBB18_1: ; %atomicrmw.start +; GFX8: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -3186,7 +3186,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3224,7 +3224,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3270,7 +3270,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -3298,7 +3298,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
v_mov_b32_e32 v9, v1 @@ -3360,7 +3360,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3398,7 +3398,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3444,7 +3444,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -3472,7 +3472,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3534,7 +3534,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_load_b64 v[4:5], 
v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3571,7 +3571,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3616,7 +3616,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3640,7 +3640,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3697,7 +3697,7 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12: .LBB22_1: ; %atomicrmw.start ; 
GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3734,7 +3734,7 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3779,7 +3779,7 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3805,7 +3805,7 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3863,7 +3863,7 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3900,7 +3900,7 @@ define 
void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3945,7 +3945,7 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3971,7 +3971,7 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4029,7 +4029,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4067,7 +4067,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 
; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4095,7 +4095,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -4123,7 +4123,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] @@ -4148,7 +4148,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -4174,7 +4174,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -4206,7 +4206,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 @@ -4240,7 +4240,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v11, v1 @@ -4278,7 +4278,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4316,7 +4316,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4362,7 +4362,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908-NEXT: global_load_dwordx2 
v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -4388,7 +4388,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -4463,7 +4463,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -4510,7 +4510,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -4552,7 +4552,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942: .LBB26_1: ; 
%atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -4588,7 +4588,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -4630,7 +4630,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -4670,7 +4670,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4706,7 +4706,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4740,7 +4740,7 @@ define half 
@global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4774,7 +4774,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4813,7 +4813,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4856,7 +4856,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4908,7 +4908,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB27_1: ; %atomicrmw.start ; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -4957,7 +4957,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5001,7 +5001,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -5040,7 +5040,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5083,7 +5083,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5124,7 +5124,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -5161,7 +5161,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -5196,7 +5196,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -5231,7 +5231,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -5271,7 +5271,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; 
GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5315,7 +5315,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5369,7 +5369,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5418,7 +5418,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5463,7 +5463,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX942: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -5502,7 +5502,7 @@ 
define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5545,7 +5545,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5586,7 +5586,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -5623,7 +5623,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -5658,7 +5658,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: 
s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -5693,7 +5693,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -5733,7 +5733,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5777,7 +5777,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5828,7 +5828,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 
v3, v5, v4 @@ -5873,7 +5873,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5914,7 +5914,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5949,7 +5949,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5989,7 +5989,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6028,7 +6028,7 @@ define void 
@global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6063,7 +6063,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -6096,7 +6096,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6129,7 +6129,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6167,7 +6167,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -6208,7 +6208,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -6259,7 +6259,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -6306,7 +6306,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6349,7 +6349,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX942: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6387,7 +6387,7 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -6428,7 +6428,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6468,7 +6468,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6504,7 +6504,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6538,7 +6538,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; 
GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6572,7 +6572,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6611,7 +6611,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6653,7 +6653,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6705,7 +6705,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -6752,7 +6752,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6796,7 +6796,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX942: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6834,7 +6834,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -6875,7 +6875,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, 
v3 @@ -6915,7 +6915,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6951,7 +6951,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6985,7 +6985,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7019,7 +7019,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7058,7 +7058,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7: .LBB31_1: ; 
%atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7100,7 +7100,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7141,7 +7141,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7177,7 +7177,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7210,7 +7210,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX942: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -7236,7 +7236,7 @@ 
define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7267,7 +7267,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7298,7 +7298,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -7327,7 +7327,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -7353,7 +7353,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; 
GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -7380,7 +7380,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -7410,7 +7410,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7444,7 +7444,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7484,7 +7484,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l @@ -7518,7 
+7518,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -7550,7 +7550,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX942: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7575,7 +7575,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l @@ -7604,7 +7604,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7634,7 +7634,7 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7662,7 +7662,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7687,7 +7687,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7713,7 +7713,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7743,7 +7743,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7776,7 +7776,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7826,7 +7826,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -7876,7 +7876,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -7921,7 +7921,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX942: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -7960,7 +7960,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr 
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 -; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8003,7 +8003,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB34_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8044,7 +8044,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8081,7 +8081,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8118,7 +8118,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB34_1: ; 
%atomicrmw.start +; GFX908: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8153,7 +8153,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8193,7 +8193,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8237,7 +8237,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8291,7 +8291,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -8339,7 +8339,7 @@ define void 
@global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8383,7 +8383,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX942: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8421,7 +8421,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 @@ -8462,7 +8462,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8502,7 +8502,7 @@ define void 
@global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8538,7 +8538,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8574,7 +8574,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8608,7 +8608,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8647,7 +8647,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8689,7 +8689,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8741,7 +8741,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB36_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8798,7 +8798,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB36_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8849,7 +8849,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -8990,7 +8990,7 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9031,7 +9031,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -9070,7 +9070,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -9108,7 +9108,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9152,7 +9152,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -9196,7 +9196,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -9246,7 +9246,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9306,7 +9306,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9359,7 +9359,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -9504,7 +9504,7 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9546,7 +9546,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -9586,7 +9586,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -9625,7 +9625,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9670,7 +9670,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9715,7 +9715,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9767,7 +9767,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9827,7 +9827,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9881,7 +9881,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX942: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -10026,7 +10026,7 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10068,7 +10068,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10108,7 +10108,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10147,7 +10147,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10192,7 +10192,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10237,7 +10237,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10288,7 +10288,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB39_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10343,7 +10343,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB39_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10392,7 +10392,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942: .LBB39_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD @@ -10528,7 +10528,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10568,7 +10568,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10606,7 +10606,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10643,7 +10643,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD @@ -10686,7 +10686,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10728,7 +10728,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10777,7 +10777,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10835,7 +10835,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10886,7 +10886,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ 
; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942: .LBB40_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11026,7 +11026,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11067,7 +11067,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11106,7 +11106,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11144,7 +11144,7 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11188,7 +11188,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11231,7 +11231,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11281,7 +11281,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11339,7 +11339,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11391,7 +11391,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942: .LBB41_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11531,7 +11531,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11572,7 +11572,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11611,7 +11611,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11649,7 +11649,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11693,7 +11693,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11736,7 +11736,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11777,7 +11777,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; 
GFX12-TRUE16: .LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -11824,7 +11824,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB42_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -11866,7 +11866,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -11979,7 +11979,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -12014,7 +12014,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -12047,7 +12047,7 
@@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -12080,7 +12080,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -12116,7 +12116,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12151,7 +12151,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12191,7 +12191,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-TRUE16: 
.LBB43_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l @@ -12236,7 +12236,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB43_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12276,7 +12276,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942: .LBB43_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12384,7 +12384,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12418,7 +12418,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, 
v3 @@ -12450,7 +12450,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12482,7 +12482,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12518,7 +12518,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12552,7 +12552,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12600,7 +12600,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: 
v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12661,7 +12661,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB44_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -12715,7 +12715,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -12860,7 +12860,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -12902,7 +12902,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -12944,7 +12944,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -12983,7 +12983,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -13028,7 +13028,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -13073,7 +13073,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -13125,7 +13125,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: 
v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -13184,7 +13184,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB45_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13236,7 +13236,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942: .LBB45_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13376,7 +13376,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13417,7 +13417,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; 
GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13458,7 +13458,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13496,7 +13496,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13540,7 +13540,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13583,7 +13583,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6: .LBB45_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13629,7 +13629,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -13658,7 +13658,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -13684,7 +13684,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -13712,7 +13712,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -13738,7 +13738,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 
s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13762,7 +13762,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13787,7 +13787,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -13825,7 +13825,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13874,7 +13874,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: 
v_cvt_f16_f32_e32 v2, v2 @@ -13922,7 +13922,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -13951,7 +13951,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -13977,7 +13977,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14005,7 +14005,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -14031,7 +14031,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: 
v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14055,7 +14055,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14082,7 +14082,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14119,7 +14119,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -14168,7 +14168,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 
v2, v2 @@ -14217,7 +14217,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14246,7 +14246,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14272,7 +14272,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14300,7 +14300,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -14326,7 +14326,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 
-; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14350,7 +14350,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14377,7 +14377,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14418,7 +14418,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14469,7 +14469,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ 
-14516,7 +14516,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -14544,7 +14544,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14569,7 +14569,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14596,7 +14596,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14621,7 +14621,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A: 
.LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14644,7 +14644,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14668,7 +14668,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -14705,7 +14705,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -14752,7 +14752,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ 
-14798,7 +14798,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -14826,7 +14826,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14851,7 +14851,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14878,7 +14878,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14903,7 +14903,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, 
v2 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14926,7 +14926,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -14952,7 +14952,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -14989,7 +14989,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15036,7 +15036,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15083,7 +15083,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15111,7 +15111,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15136,7 +15136,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15163,7 +15163,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15188,7 +15188,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15211,7 +15211,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15237,7 +15237,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15278,7 +15278,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15329,7 +15329,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6: .LBB51_1: ; 
%atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15376,7 +15376,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -15406,7 +15406,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -15432,7 +15432,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -15460,7 +15460,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -15486,7 +15486,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; 
GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -15512,7 +15512,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -15539,7 +15539,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -15576,7 +15576,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -15625,7 +15625,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6: .LBB52_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -15674,7 +15674,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -15703,7 +15703,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15728,7 +15728,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15755,7 +15755,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15780,7 +15780,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; 
GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15805,7 +15805,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -15831,7 +15831,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15868,7 +15868,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15915,7 +15915,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; 
GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15967,7 +15967,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16021,7 +16021,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16071,7 +16071,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -16210,7 +16210,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16252,7 +16252,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16292,7 +16292,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16330,7 +16330,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -16380,7 +16380,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16425,7 +16425,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 
0xffff0000, v6 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16471,7 +16471,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16525,7 +16525,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16575,7 +16575,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -16714,7 +16714,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16756,7 +16756,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16796,7 +16796,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16836,7 +16836,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16885,7 +16885,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16930,7 +16930,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16977,7 +16977,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -17031,7 +17031,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -17081,7 +17081,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -17220,7 +17220,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -17262,7 +17262,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -17302,7 +17302,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -17342,7 +17342,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -17395,7 +17395,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -17442,7 +17442,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -17487,7 +17487,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17539,7 +17539,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17588,7 +17588,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17722,7 +17722,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17763,7 +17763,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17802,7 +17802,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17839,7 +17839,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17888,7 +17888,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17931,7 +17931,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17975,7 +17975,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18027,7 +18027,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB58_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18076,7 +18076,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX942: .LBB58_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18210,7 +18210,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10: .LBB58_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18251,7 +18251,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18290,7 +18290,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX908: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18329,7 +18329,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18378,7 +18378,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18421,7 +18421,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX6: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18466,7 +18466,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18518,7 +18518,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB59_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18567,7 +18567,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX942: .LBB59_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18701,7 +18701,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB59_1: 
; %atomicrmw.start +; GFX10: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18742,7 +18742,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18781,7 +18781,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX908: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18820,7 +18820,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18873,7 +18873,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18920,7 +18920,7 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX6: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18965,7 +18965,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB60_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -19020,7 +19020,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB60_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -19071,7 +19071,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX942: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -19210,7 +19210,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -19252,7 +19252,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -19294,7 +19294,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX908: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -19334,7 +19334,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -19383,7 +19383,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -19428,7 +19428,7 @@ define 
<2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX6: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -19475,7 +19475,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB61_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19528,7 +19528,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB61_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19578,7 +19578,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX942: .LBB61_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19712,7 +19712,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19753,7 +19753,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19794,7 +19794,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX908: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19833,7 +19833,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19882,7 +19882,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; 
GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -19925,7 +19925,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX6: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index b8762d13e1327..23baec507d0ed 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -25,7 +25,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX12: .LBB0_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -52,7 +52,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -75,7 +75,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_mov_b32_e32 v4, v3 @@ -101,7 +101,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -125,7 +125,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -147,7 +147,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX908: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -169,7 +169,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -195,7 +195,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: 
.LBB0_1: ; %atomicrmw.start +; GFX7: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -223,7 +223,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX6: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -257,7 +257,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX12: .LBB1_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -284,7 +284,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -307,7 +307,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX11: .LBB1_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -333,7 +333,7 @@ define float 
@global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX10: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -357,7 +357,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX90A: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -379,7 +379,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -403,7 +403,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -428,7 +428,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB1_1: ; 
%atomicrmw.start +; GFX7: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -456,7 +456,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX6: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -491,7 +491,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX12: .LBB2_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -518,7 +518,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX942: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -541,7 +541,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX11: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -567,7 +567,7 @@ define float 
@global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX10: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -591,7 +591,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX90A: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -613,7 +613,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX908: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -637,7 +637,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -668,7 +668,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7: .LBB2_1: ; 
%atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v0 @@ -701,7 +701,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX6: .LBB2_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v6, v0 @@ -735,7 +735,7 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX12: .LBB3_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -760,7 +760,7 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -782,7 +782,7 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX11: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -806,7 +806,7 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword 
v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX10: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -829,7 +829,7 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -850,7 +850,7 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX908: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -871,7 +871,7 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -896,7 +896,7 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ 
-923,7 +923,7 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX6: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -956,7 +956,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX12: .LBB4_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -981,7 +981,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -1003,7 +1003,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX11: .LBB4_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1027,7 +1027,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; 
GFX10-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX10: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1050,7 +1050,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -1071,7 +1071,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1094,7 +1094,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1119,7 +1119,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1146,7 +1146,7 @@ 
define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX6: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1180,7 +1180,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1205,7 +1205,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -1227,7 +1227,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1251,7 +1251,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; 
GFX10-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX10: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1274,7 +1274,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -1295,7 +1295,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1318,7 +1318,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1347,7 +1347,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1378,7 +1378,7 @@ define void 
@global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX6: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1412,7 +1412,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -1440,7 +1440,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1463,7 +1463,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1489,7 +1489,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10: .LBB6_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1513,7 +1513,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -1537,7 +1537,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1561,7 +1561,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -1586,7 +1586,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -1614,7 +1614,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) 
; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -1649,7 +1649,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1675,7 +1675,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -1697,7 +1697,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1721,7 +1721,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10: .LBB7_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1744,7 +1744,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -1767,7 +1767,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1790,7 +1790,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1815,7 +1815,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1842,7 +1842,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr 
addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -1880,7 +1880,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -1907,7 +1907,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -1930,7 +1930,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -1956,7 +1956,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1980,7 +1980,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2002,7 +2002,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2024,7 +2024,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -2050,7 +2050,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -2078,7 +2078,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; 
GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -2112,7 +2112,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -2139,7 +2139,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -2162,7 +2162,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -2188,7 +2188,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ 
-2212,7 +2212,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2234,7 +2234,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2258,7 +2258,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -2283,7 +2283,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -2311,7 +2311,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 
0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -2346,7 +2346,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -2373,7 +2373,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -2396,7 +2396,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -2422,7 +2422,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -2446,7 +2446,7 @@ 
define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -2468,7 +2468,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -2492,7 +2492,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -2523,7 +2523,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v0 @@ -2556,7 +2556,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6: .LBB10_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v6, v0 @@ -2590,7 +2590,7 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2615,7 +2615,7 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -2637,7 +2637,7 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2661,7 +2661,7 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2684,7 +2684,7 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -2705,7 +2705,7 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2726,7 +2726,7 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2751,7 +2751,7 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2778,7 +2778,7 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX6: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2811,7 +2811,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2836,7 +2836,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -2858,7 +2858,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2882,7 +2882,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2905,7 +2905,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -2926,7 +2926,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2949,7 +2949,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -2974,7 +2974,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3001,7 +3001,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3035,7 +3035,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3060,7 +3060,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -3082,7 +3082,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3106,7 +3106,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3129,7 +3129,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -3150,7 +3150,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3173,7 +3173,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3202,7 +3202,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3233,7 +3233,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt 
vmcnt(0) ; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3267,7 +3267,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -3295,7 +3295,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -3318,7 +3318,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -3344,7 +3344,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -3368,7 +3368,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -3392,7 +3392,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -3416,7 +3416,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -3441,7 +3441,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 @@ -3469,7 +3469,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt 
vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 @@ -3504,7 +3504,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3530,7 +3530,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -3552,7 +3552,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3576,7 +3576,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3599,7 +3599,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 @@ -3622,7 +3622,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3645,7 +3645,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3670,7 +3670,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3697,7 +3697,7 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 @@ -3735,7 +3735,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3762,7 +3762,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] @@ -3786,7 +3786,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3812,7 +3812,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -3838,7 +3838,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] @@ -3861,7 +3861,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -3885,7 +3885,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -3917,7 +3917,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 @@ -3951,7 +3951,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v11, v1 @@ -3987,7 +3987,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4014,7 +4014,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] @@ -4038,7 +4038,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4064,7 +4064,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -4090,7 +4090,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr 
addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] @@ -4113,7 +4113,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -4139,7 +4139,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -4169,7 +4169,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 @@ -4203,7 +4203,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v11, v1 @@ -4240,7 +4240,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4267,7 +4267,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX942: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] @@ -4291,7 +4291,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -4317,7 +4317,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -4343,7 +4343,7 @@ define double 
@global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] @@ -4366,7 +4366,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -4392,7 +4392,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -4426,7 +4426,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 @@ -4464,7 +4464,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6: .LBB18_1: 
; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v11, v1 @@ -4501,7 +4501,7 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] @@ -4526,7 +4526,7 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX942: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4548,7 +4548,7 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4572,7 +4572,7 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4596,7 +4596,7 @@ define void 
@global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX90A: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4617,7 +4617,7 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4639,7 +4639,7 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4665,7 +4665,7 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4695,7 +4695,7 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: 
s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX6: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4731,7 +4731,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] @@ -4756,7 +4756,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX942: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4778,7 +4778,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4802,7 +4802,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4826,7 +4826,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX90A: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4847,7 +4847,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4871,7 +4871,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4897,7 +4897,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4927,7 +4927,7 @@ define void 
@global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX6: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4964,7 +4964,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] @@ -4989,7 +4989,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX942: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5011,7 +5011,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5035,7 +5035,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 
v[6:7], v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX10: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5059,7 +5059,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5080,7 +5080,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5104,7 +5104,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5134,7 +5134,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5168,7 +5168,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX6: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -5217,7 +5217,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB22_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5261,7 +5261,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB22_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5301,7 +5301,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX942: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -5335,7 +5335,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB22_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5374,7 +5374,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB22_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5412,7 +5412,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -5446,7 +5446,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -5478,7 +5478,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908: 
.LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -5510,7 +5510,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -5548,7 +5548,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -5591,7 +5591,7 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -5640,7 +5640,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB23_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5685,7 +5685,7 @@ define half 
@global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB23_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5727,7 +5727,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX942: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -5762,7 +5762,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB23_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5802,7 +5802,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB23_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -5841,7 +5841,7 @@ define 
half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -5876,7 +5876,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -5909,7 +5909,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -5942,7 +5942,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -5981,7 +5981,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6025,7 +6025,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6076,7 +6076,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB24_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6121,7 +6121,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB24_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6164,7 +6164,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX942: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -6199,7 
+6199,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB24_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6239,7 +6239,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB24_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -6278,7 +6278,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6313,7 +6313,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -6346,7 +6346,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; 
GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -6379,7 +6379,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6418,7 +6418,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6462,7 +6462,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6512,7 +6512,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB25_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6555,7 +6555,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB25_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6593,7 +6593,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX942: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -6626,7 +6626,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB25_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6664,7 +6664,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB25_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6700,7 
+6700,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6733,7 +6733,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -6764,7 +6764,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6795,7 +6795,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6832,7 +6832,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7: .LBB25_1: ; %atomicrmw.start ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -6873,7 +6873,7 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -6921,7 +6921,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -6965,7 +6965,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7005,7 +7005,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7039,7 +7039,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7078,7 +7078,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7115,7 +7115,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7149,7 +7149,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7181,7 +7181,7 @@ 
define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7213,7 +7213,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7251,7 +7251,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7293,7 +7293,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7342,7 +7342,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB27_1: ; 
%atomicrmw.start +; GFX12-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7386,7 +7386,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7427,7 +7427,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7461,7 +7461,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7500,7 +7500,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 
-; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7537,7 +7537,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7571,7 +7571,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -7603,7 +7603,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7635,7 +7635,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -7673,7 +7673,7 @@ define void 
@global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7715,7 +7715,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7755,7 +7755,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7788,7 +7788,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7819,7 +7819,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 
0xffff0000 -; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX942: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -7843,7 +7843,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7871,7 +7871,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB28_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -7900,7 +7900,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -7927,7 +7927,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A: .LBB28_1: ; %atomicrmw.start ; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -7951,7 +7951,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -7976,7 +7976,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -8005,7 +8005,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -8039,7 +8039,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -8078,7 +8078,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], 
off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l @@ -8110,7 +8110,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 @@ -8139,7 +8139,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2 @@ -8162,7 +8162,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l @@ -8189,7 +8189,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: 
s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB29_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 @@ -8216,7 +8216,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_f16_e32 v3, v4, v2 @@ -8242,7 +8242,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_sub_f16_e32 v3, v5, v2 @@ -8265,7 +8265,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_sub_f16_e32 v3, v4, v2 @@ -8289,7 +8289,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: v_sub_f16_e32 v3, v4, v2 @@ -8318,7 +8318,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -8351,7 +8351,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -8398,7 +8398,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8444,7 +8444,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8487,7 +8487,7 @@ define half 
@global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX942: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v4 @@ -8522,7 +8522,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8562,7 +8562,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -8601,7 +8601,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8636,7 +8636,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: 
v_lshlrev_b32_e64 v5, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 @@ -8671,7 +8671,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v4 @@ -8704,7 +8704,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8743,7 +8743,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8787,7 +8787,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8838,7 
+8838,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8883,7 +8883,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8924,7 +8924,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX942: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -8958,7 +8958,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v3, v5, v4 @@ -8997,7 +8997,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9034,7 +9034,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9068,7 +9068,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -9102,7 +9102,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9134,7 +9134,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_lshlrev_b32_e64 
v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9172,7 +9172,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9214,7 +9214,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9266,7 +9266,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9323,7 +9323,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9374,7 +9374,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX942: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -9515,7 +9515,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -9556,7 +9556,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -9595,7 +9595,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -9633,7 +9633,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9677,7 +9677,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -9720,7 +9720,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -9769,7 +9769,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9829,7 +9829,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -9882,7 +9882,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX942: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -10027,7 +10027,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10069,7 +10069,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10109,7 +10109,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10148,7 +10148,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10193,7 +10193,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10237,7 +10237,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10288,7 +10288,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10348,7 +10348,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -10402,7 +10402,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX942: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -10547,7 +10547,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10589,7 +10589,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10629,7 +10629,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10668,7 +10668,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10713,7 +10713,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10757,7 +10757,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10807,7 +10807,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10862,7 +10862,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -10911,7 +10911,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX942: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11047,7 +11047,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11087,7 +11087,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11125,7 +11125,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11162,7 +11162,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11205,7 +11205,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11246,7 +11246,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11294,7 +11294,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB36_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11352,7 +11352,7 @@ define void 
@global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB36_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11403,7 +11403,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11543,7 +11543,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11584,7 +11584,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11623,7 +11623,7 @@ 
define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11661,7 +11661,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11705,7 +11705,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11747,7 +11747,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11796,7 +11796,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -11854,7 +11854,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11906,7 +11906,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12046,7 +12046,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12087,7 +12087,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 
s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12126,7 +12126,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12164,7 +12164,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12208,7 +12208,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12250,7 +12250,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, 
v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -12290,7 +12290,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -12337,7 +12337,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -12379,7 +12379,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX942: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -12492,7 +12492,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -12527,7 +12527,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -12560,7 +12560,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -12593,7 +12593,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -12629,7 +12629,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12663,7 +12663,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: 
.LBB38_1: ; %atomicrmw.start +; GFX6: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12702,7 +12702,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB39_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l @@ -12747,7 +12747,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB39_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12787,7 +12787,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942: .LBB39_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12895,7 +12895,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12929,7 +12929,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12961,7 +12961,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -12993,7 +12993,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13029,7 +13029,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13062,7 +13062,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -13109,7 +13109,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 @@ -13170,7 +13170,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 @@ -13224,7 +13224,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942: .LBB40_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 @@ -13369,7 +13369,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10: .LBB40_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -13411,7 +13411,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -13453,7 +13453,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -13492,7 +13492,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -13537,7 +13537,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -13581,7 +13581,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: 
v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -13632,7 +13632,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -13691,7 +13691,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -13743,7 +13743,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942: .LBB41_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13883,7 +13883,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 
v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13924,7 +13924,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -13965,7 +13965,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14003,7 +14003,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -14047,7 +14047,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; 
GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -14089,7 +14089,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -14133,7 +14133,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14160,7 +14160,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14183,7 +14183,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14209,7 +14209,7 @@ define <2 x half> 
@global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -14233,7 +14233,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14255,7 +14255,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14277,7 +14277,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -14313,7 +14313,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7: .LBB42_1: 
; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -14362,7 +14362,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -14409,7 +14409,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14436,7 +14436,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942: .LBB43_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14459,7 +14459,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14485,7 +14485,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr 
addrspa ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -14509,7 +14509,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14531,7 +14531,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14555,7 +14555,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -14590,7 +14590,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7: .LBB43_1: ; %atomicrmw.start 
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -14639,7 +14639,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -14687,7 +14687,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -14714,7 +14714,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -14737,7 +14737,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -14763,7 +14763,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -14787,7 +14787,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -14809,7 +14809,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -14833,7 +14833,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -14872,7 +14872,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7: .LBB44_1: ; %atomicrmw.start ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14923,7 +14923,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -14969,7 +14969,7 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -14994,7 +14994,7 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942: .LBB45_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15016,7 +15016,7 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15040,7 +15040,7 @@ define void 
@global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15063,7 +15063,7 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15084,7 +15084,7 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15105,7 +15105,7 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15140,7 +15140,7 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; 
GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15187,7 +15187,7 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15232,7 +15232,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15257,7 +15257,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15279,7 +15279,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11: .LBB46_1: 
; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15303,7 +15303,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15326,7 +15326,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15347,7 +15347,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15370,7 +15370,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15405,7 +15405,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15452,7 +15452,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15498,7 +15498,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15523,7 +15523,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15545,7 +15545,7 @@ define void 
@global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15569,7 +15569,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15592,7 +15592,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15613,7 +15613,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -15636,7 +15636,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr 
addrspace(1 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -15675,7 +15675,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15726,7 +15726,7 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -15772,7 +15772,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -15800,7 +15800,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942: 
.LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 @@ -15823,7 +15823,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -15849,7 +15849,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -15873,7 +15873,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -15897,7 +15897,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -15921,7 +15921,7 @@ 
define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 @@ -15956,7 +15956,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -16005,7 +16005,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -16053,7 +16053,7 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -16079,7 +16079,7 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB49_1: ; 
%atomicrmw.start +; GFX942: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -16101,7 +16101,7 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -16125,7 +16125,7 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -16148,7 +16148,7 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -16171,7 +16171,7 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908: .LBB49_1: ; %atomicrmw.start ; 
GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -16194,7 +16194,7 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -16229,7 +16229,7 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -16276,7 +16276,7 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -16328,7 +16328,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB50_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16382,7 +16382,7 @@ 
define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB50_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16432,7 +16432,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -16571,7 +16571,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16613,7 +16613,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16653,7 +16653,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: 
.LBB50_1: ; %atomicrmw.start +; GFX908: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16691,7 +16691,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -16741,7 +16741,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16786,7 +16786,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16832,7 +16832,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB51_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16886,7 
+16886,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB51_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -16936,7 +16936,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -17075,7 +17075,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -17117,7 +17117,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -17157,7 +17157,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; 
GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -17197,7 +17197,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -17246,7 +17246,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -17291,7 +17291,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -17338,7 +17338,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB52_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, 
v3 @@ -17392,7 +17392,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB52_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -17442,7 +17442,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -17581,7 +17581,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -17623,7 +17623,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -17663,7 +17663,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 
0x7060302 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -17703,7 +17703,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -17756,7 +17756,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -17803,7 +17803,7 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -17848,7 +17848,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB53_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ 
-17900,7 +17900,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB53_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17949,7 +17949,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18083,7 +18083,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18124,7 +18124,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18163,7 +18163,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; 
GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18200,7 +18200,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18249,7 +18249,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18292,7 +18292,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18336,7 +18336,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18388,7 +18388,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18437,7 +18437,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18571,7 +18571,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18612,7 +18612,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18651,7 +18651,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff 
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18690,7 +18690,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18739,7 +18739,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18782,7 +18782,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18827,7 +18827,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18879,7 +18879,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -18928,7 +18928,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19062,7 +19062,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19103,7 +19103,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19142,7 +19142,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr 
addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19181,7 +19181,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19234,7 +19234,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -19281,7 +19281,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -19326,7 +19326,7 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB56_1: ; %atomicrmw.start ; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -19381,7 +19381,7 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -19432,7 +19432,7 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v3 @@ -19571,7 +19571,7 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -19613,7 +19613,7 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -19655,7 +19655,7 @@ define <2 x bfloat> 
@global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -19695,7 +19695,7 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -19744,7 +19744,7 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -19789,7 +19789,7 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -19836,7 +19836,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB57_1: ; 
%atomicrmw.start +; GFX12-TRUE16: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19889,7 +19889,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19939,7 +19939,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20073,7 +20073,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20114,7 +20114,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 @@ -20155,7 +20155,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20194,7 +20194,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20243,7 +20243,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -20286,7 +20286,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll index 6fe9e1d5561de..8f3dc6ef7646e 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -23,6 +23,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap ; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: global_load_dword v3, v[0:1], off glc @@ -61,6 +62,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: s_movk_i32 s1, 0x100 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB1_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_lshlrev_b64 v[3:4], 1, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll index b7ee9f70f6014..7c8c5f70aa1c7 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -19,6 +19,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -42,6 +43,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -68,6 +70,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_add_co_ci_u32_e64 
v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -95,6 +98,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -132,6 +136,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -155,6 +160,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -181,6 +187,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -208,6 +215,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: 
s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -245,6 +253,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -266,6 +275,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -290,6 +300,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -314,6 +325,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -346,6 +358,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -367,6 +380,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -391,6 +405,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -415,6 +430,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -448,6 +464,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -475,6 +492,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) 
@@ -505,6 +523,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -536,6 +555,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -578,6 +598,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -605,6 +626,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -635,6 +657,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -666,6 +689,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr 
addrspace(1) i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -709,6 +733,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -733,6 +758,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -760,6 +786,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -787,6 +814,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -823,6 +851,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: 
v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -847,6 +876,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -874,6 +904,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -901,6 +932,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -943,6 +975,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -966,6 +999,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; 
GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -992,6 +1026,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1019,6 +1054,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1056,6 +1092,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1079,6 +1116,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1105,6 +1143,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1132,6 +1171,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1169,6 +1209,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1190,6 +1231,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1214,6 +1256,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1238,6 +1281,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 
0x0 @@ -1270,6 +1314,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1291,6 +1336,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1315,6 +1361,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1339,6 +1386,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1372,6 +1420,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1399,6 +1448,7 @@ define amdgpu_ps <2 x float> 
@global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1429,6 +1479,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1460,6 +1511,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1502,6 +1554,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1529,6 +1582,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1559,6 +1613,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; 
GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1590,6 +1645,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1633,6 +1689,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1657,6 +1714,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1684,6 +1742,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1711,6 +1770,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: 
v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1747,6 +1807,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1771,6 +1832,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1798,6 +1860,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1825,6 +1888,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1867,6 +1931,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 
s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1890,6 +1955,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1916,6 +1982,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1943,6 +2010,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1980,6 +2048,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2003,6 +2072,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB17_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2029,6 +2099,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2056,6 +2127,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2093,6 +2165,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2114,6 +2187,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2138,6 +2212,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2162,6 +2237,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2194,6 +2270,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2215,6 +2292,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2239,6 +2317,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2263,6 +2342,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2296,6 +2376,7 @@ define 
amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2323,6 +2404,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2353,6 +2435,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2384,6 +2467,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2426,6 +2510,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2453,6 +2538,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; 
GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2483,6 +2569,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2514,6 +2601,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2557,6 +2645,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2581,6 +2670,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2608,6 +2698,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; 
GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2635,6 +2726,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2671,6 +2763,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2695,6 +2788,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2722,6 +2816,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2749,6 +2844,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 
s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2791,6 +2887,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2814,6 +2911,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2840,6 +2938,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2867,6 +2966,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2904,6 +3004,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB25_1: ; 
%atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2927,6 +3028,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2953,6 +3055,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2980,6 +3083,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3017,6 +3121,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3038,6 +3143,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3062,6 +3168,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3086,6 +3193,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3118,6 +3226,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3139,6 +3248,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3163,6 +3273,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3187,6 +3298,7 @@ define 
amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3220,6 +3332,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3247,6 +3360,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3277,6 +3391,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3308,6 +3423,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3350,6 +3466,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr 
addrspace(1) ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3377,6 +3494,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3407,6 +3525,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3438,6 +3557,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3481,6 +3601,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3505,6 +3626,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: 
v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3532,6 +3654,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3559,6 +3682,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3595,6 +3719,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3619,6 +3744,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3646,6 +3772,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: 
s_waitcnt_depctr 0xfffe +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3673,6 +3800,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 1602e31d6147c..2f3fd9d784d1d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4810,6 +4810,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB132_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_add_u32 s4, s2, s0 @@ -4827,6 +4828,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB132_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -4845,6 +4847,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB132_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -4863,6 +4866,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-SDAG: ; %bb.0: ; %bb ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-SDAG-NEXT: .p2align ; GFX12-SDAG-NEXT: .LBB132_1: ; %bb3 ; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -4882,6 +4886,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-GISEL-NEXT: .p2align ; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -4920,6 +4925,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB133_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_add_u32 s4, s2, s0 @@ -4940,6 +4946,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB133_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -4961,6 +4968,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB133_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -4981,6 +4989,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-SDAG: ; %bb.0: ; %bb ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-SDAG-NEXT: .p2align ; GFX12-SDAG-NEXT: .LBB133_1: ; %bb3 ; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -5002,6 +5011,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-GISEL-NEXT: .p2align ; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index da132d0269e6b..30a8e4f6644ee 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -9459,7 +9459,7 @@ define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr a ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: .LBB136_1: ; %atomicrmw.start +; SI: .LBB136_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_subrev_i32_e32 v0, vcc, s7, v1 ; SI-NEXT: v_and_b32_e32 v0, s6, v0 @@ -9499,7 +9499,7 @@ define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr a ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: .LBB136_1: ; %atomicrmw.start +; VI: .LBB136_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_subrev_u32_e32 v0, vcc, s8, v1 ; VI-NEXT: v_and_b32_e32 v2, s7, v1 @@ -9537,7 +9537,7 @@ define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr a ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX9: .LBB136_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 @@ -9578,7 +9578,7 @@ define amdgpu_kernel 
void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: .LBB137_1: ; %atomicrmw.start +; SI: .LBB137_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_subrev_i32_e32 v0, vcc, s7, v1 ; SI-NEXT: v_and_b32_e32 v0, s6, v0 @@ -9618,7 +9618,7 @@ define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: .LBB137_1: ; %atomicrmw.start +; VI: .LBB137_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_subrev_u32_e32 v0, vcc, s8, v1 ; VI-NEXT: v_and_b32_e32 v2, s7, v1 @@ -9656,7 +9656,7 @@ define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX9: .LBB137_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index ffab56847edca..08853d5d8d0cf 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -1286,7 +1286,7 @@ define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB30_1: ; %atomicrmw.start +; SI: .LBB30_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 @@ -1311,7 +1311,7 @@ define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI: .LBB30_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 @@ -1332,7 +1332,7 @@ define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9: .LBB30_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 @@ -1361,7 +1361,7 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB31_1: ; %atomicrmw.start +; SI: .LBB31_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 @@ -1388,7 +1388,7 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB31_1: ; %atomicrmw.start +; VI: .LBB31_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 @@ -1409,7 +1409,7 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9: .LBB31_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 @@ 
-1439,7 +1439,7 @@ define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB32_1: ; %atomicrmw.start +; SI: .LBB32_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -1465,7 +1465,7 @@ define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB32_1: ; %atomicrmw.start +; VI: .LBB32_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -1487,7 +1487,7 @@ define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9: .LBB32_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -1517,7 +1517,7 @@ define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB33_1: ; %atomicrmw.start +; SI: .LBB33_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -1545,7 +1545,7 @@ define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB33_1: ; %atomicrmw.start +; VI: .LBB33_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt 
vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -1566,7 +1566,7 @@ define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9: .LBB33_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -1602,7 +1602,7 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB34_1: ; %atomicrmw.start +; SI: .LBB34_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v1 @@ -1634,7 +1634,7 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI: .LBB34_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 @@ -1656,7 +1656,7 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9: .LBB34_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 @@ -1690,7 +1690,7 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB35_1: ; %atomicrmw.start +; SI: .LBB35_1: ; 
%atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v1 @@ -1724,7 +1724,7 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB35_1: ; %atomicrmw.start +; VI: .LBB35_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 @@ -1746,7 +1746,7 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9: .LBB35_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 @@ -1781,7 +1781,7 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB36_1: ; %atomicrmw.start +; SI: .LBB36_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -1815,7 +1815,7 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI: .LBB36_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -1837,7 +1837,7 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB36_1: 
; %atomicrmw.start +; GFX9: .LBB36_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -1871,7 +1871,7 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB37_1: ; %atomicrmw.start +; SI: .LBB37_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -1905,7 +1905,7 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI: .LBB37_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -1927,7 +1927,7 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9: .LBB37_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -2071,7 +2071,7 @@ define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB41_1: ; %atomicrmw.start +; SI: .LBB41_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, v4, v2 @@ -2096,7 +2096,7 @@ define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: 
s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB41_1: ; %atomicrmw.start +; VI: .LBB41_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v3, v4, v2 @@ -2117,7 +2117,7 @@ define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9: .LBB41_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 @@ -2146,7 +2146,7 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB42_1: ; %atomicrmw.start +; SI: .LBB42_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, v4, v2 @@ -2173,7 +2173,7 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB42_1: ; %atomicrmw.start +; VI: .LBB42_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v3, v4, v2 @@ -2194,7 +2194,7 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9: .LBB42_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 @@ -2224,7 +2224,7 @@ define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 
%in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB43_1: ; %atomicrmw.start +; SI: .LBB43_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -2250,7 +2250,7 @@ define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB43_1: ; %atomicrmw.start +; VI: .LBB43_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -2272,7 +2272,7 @@ define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9: .LBB43_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -2302,7 +2302,7 @@ define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB44_1: ; %atomicrmw.start +; SI: .LBB44_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -2330,7 +2330,7 @@ define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB44_1: ; %atomicrmw.start +; VI: .LBB44_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -2351,7 +2351,7 @@ define i32 
@global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9: .LBB44_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -2387,7 +2387,7 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB45_1: ; %atomicrmw.start +; SI: .LBB45_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s34, v1 @@ -2419,7 +2419,7 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB45_1: ; %atomicrmw.start +; VI: .LBB45_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2441,7 +2441,7 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX9: .LBB45_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 @@ -2475,7 +2475,7 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB46_1: ; %atomicrmw.start +; SI: .LBB46_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) 
; SI-NEXT: v_and_b32_e32 v0, s34, v1 @@ -2509,7 +2509,7 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB46_1: ; %atomicrmw.start +; VI: .LBB46_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2531,7 +2531,7 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9: .LBB46_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 @@ -2566,7 +2566,7 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB47_1: ; %atomicrmw.start +; SI: .LBB47_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -2600,7 +2600,7 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB47_1: ; %atomicrmw.start +; VI: .LBB47_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -2622,7 +2622,7 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9: .LBB47_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -2656,7 +2656,7 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB48_1: ; %atomicrmw.start +; SI: .LBB48_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -2690,7 +2690,7 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB48_1: ; %atomicrmw.start +; VI: .LBB48_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -2712,7 +2712,7 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9: .LBB48_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -2819,6 +2819,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB51_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -2845,6 +2846,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB51_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ 
-2867,6 +2869,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2897,6 +2900,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB52_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -2925,6 +2929,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB52_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2947,6 +2952,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2978,6 +2984,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB53_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3005,6 +3012,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: 
flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB53_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3028,6 +3036,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3059,6 +3068,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB54_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3088,6 +3098,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3110,6 +3121,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3147,6 +3159,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB55_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3180,6 +3193,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3203,6 +3217,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3238,6 +3253,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB56_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3273,6 +3289,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3296,6 +3313,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3332,6 +3350,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 
0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB57_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3367,6 +3386,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3390,6 +3410,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3425,6 +3446,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB58_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3460,6 +3482,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB58_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3483,6 +3506,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3514,6 +3538,7 @@ define void 
@global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB59_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3542,6 +3567,7 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB59_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3564,6 +3590,7 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3595,6 +3622,7 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB60_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3624,6 +3652,7 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB60_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3646,6 +3675,7 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, 
v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3682,7 +3712,7 @@ define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB61_1: ; %atomicrmw.start +; SI: .LBB61_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3707,7 +3737,7 @@ define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB61_1: ; %atomicrmw.start +; VI: .LBB61_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3728,7 +3758,7 @@ define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9: .LBB61_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3757,7 +3787,7 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB62_1: ; %atomicrmw.start +; SI: .LBB62_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3784,7 +3814,7 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, 
vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB62_1: ; %atomicrmw.start +; VI: .LBB62_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3805,7 +3835,7 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9: .LBB62_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 @@ -3835,7 +3865,7 @@ define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB63_1: ; %atomicrmw.start +; SI: .LBB63_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -3861,7 +3891,7 @@ define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB63_1: ; %atomicrmw.start +; VI: .LBB63_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -3883,7 +3913,7 @@ define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9: .LBB63_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -3913,7 +3943,7 @@ define i32 @global_atomic_or_i32_ret_offset(ptr 
addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB64_1: ; %atomicrmw.start +; SI: .LBB64_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -3941,7 +3971,7 @@ define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB64_1: ; %atomicrmw.start +; VI: .LBB64_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -3962,7 +3992,7 @@ define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX9: .LBB64_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -3998,7 +4028,7 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB65_1: ; %atomicrmw.start +; SI: .LBB65_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, s34, v1 @@ -4030,7 +4060,7 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB65_1: ; %atomicrmw.start +; VI: .LBB65_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v2, s6, v3 @@ -4052,7 +4082,7 
@@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9: .LBB65_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, s6, v1 @@ -4086,7 +4116,7 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB66_1: ; %atomicrmw.start +; SI: .LBB66_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, s34, v1 @@ -4120,7 +4150,7 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB66_1: ; %atomicrmw.start +; VI: .LBB66_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v2, s6, v3 @@ -4142,7 +4172,7 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9: .LBB66_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, s6, v1 @@ -4177,7 +4207,7 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB67_1: ; %atomicrmw.start +; SI: .LBB67_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: v_mov_b32_e32 v2, v0 @@ -4211,7 +4241,7 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB67_1: ; %atomicrmw.start +; VI: .LBB67_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -4233,7 +4263,7 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9: .LBB67_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -4267,7 +4297,7 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB68_1: ; %atomicrmw.start +; SI: .LBB68_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -4301,7 +4331,7 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB68_1: ; %atomicrmw.start +; VI: .LBB68_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -4323,7 +4353,7 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX9: .LBB68_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -4467,7 +4497,7 @@ define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB72_1: ; %atomicrmw.start +; SI: .LBB72_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -4492,7 +4522,7 @@ define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB72_1: ; %atomicrmw.start +; VI: .LBB72_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -4513,7 +4543,7 @@ define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX9: .LBB72_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -4542,7 +4572,7 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB73_1: ; %atomicrmw.start +; SI: .LBB73_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -4569,7 +4599,7 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB73_1: ; %atomicrmw.start +; VI: .LBB73_1: ; 
%atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -4590,7 +4620,7 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9: .LBB73_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 @@ -4620,7 +4650,7 @@ define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB74_1: ; %atomicrmw.start +; SI: .LBB74_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -4646,7 +4676,7 @@ define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB74_1: ; %atomicrmw.start +; VI: .LBB74_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -4668,7 +4698,7 @@ define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9: .LBB74_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -4698,7 +4728,7 @@ define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; 
SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB75_1: ; %atomicrmw.start +; SI: .LBB75_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -4726,7 +4756,7 @@ define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB75_1: ; %atomicrmw.start +; VI: .LBB75_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -4747,7 +4777,7 @@ define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9: .LBB75_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -4783,7 +4813,7 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB76_1: ; %atomicrmw.start +; SI: .LBB76_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, s34, v1 @@ -4815,7 +4845,7 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB76_1: ; %atomicrmw.start +; VI: .LBB76_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v2, s6, v3 @@ -4837,7 +4867,7 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9: .LBB76_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1 @@ -4871,7 +4901,7 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB77_1: ; %atomicrmw.start +; SI: .LBB77_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, s34, v1 @@ -4905,7 +4935,7 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB77_1: ; %atomicrmw.start +; VI: .LBB77_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v2, s6, v3 @@ -4927,7 +4957,7 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9: .LBB77_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1 @@ -4962,7 +4992,7 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB78_1: ; %atomicrmw.start +; SI: .LBB78_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -4996,7 +5026,7 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr 
addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB78_1: ; %atomicrmw.start +; VI: .LBB78_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -5018,7 +5048,7 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX9: .LBB78_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -5052,7 +5082,7 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB79_1: ; %atomicrmw.start +; SI: .LBB79_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -5086,7 +5116,7 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB79_1: ; %atomicrmw.start +; VI: .LBB79_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -5108,7 +5138,7 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX9: .LBB79_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -5252,6 +5282,7 @@ define void 
@global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB83_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5277,6 +5308,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB83_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5298,6 +5330,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5327,6 +5360,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5354,6 +5388,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5375,6 +5410,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5405,6 +5441,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5431,6 +5468,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5453,6 +5491,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5483,6 +5522,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5511,6 +5551,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5532,6 +5573,7 @@ define i32 
@global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5568,6 +5610,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB87_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5600,6 +5643,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5622,6 +5666,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5656,6 +5701,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB88_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5690,6 +5736,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: 
.p2align ; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5712,6 +5759,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5747,6 +5795,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB89_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5781,6 +5830,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5803,6 +5853,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5837,6 +5888,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB90_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5871,6 +5923,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr 
addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5893,6 +5946,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5929,6 +5983,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_i32_e32 v0, s2, v1 @@ -5963,6 +6018,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_max_i32_e32 v2, s2, v3 @@ -5991,6 +6047,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 @@ -6028,6 +6085,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB92_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_i32_e32 v0, 
s8, v1 @@ -6069,6 +6127,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB92_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 @@ -6102,6 +6161,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -6142,6 +6202,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB93_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_i32_e32 v0, s2, v1 @@ -6174,6 +6235,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: .p2align ; VI-NEXT: .LBB93_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_max_i32_e32 v2, s2, v3 @@ -6202,6 +6264,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 @@ -6238,6 +6301,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB94_1: ; %atomicrmw.start ; 
SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_i32_e32 v0, s8, v1 @@ -6277,6 +6341,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: .p2align ; VI-NEXT: .LBB94_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 @@ -6310,6 +6375,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -6420,6 +6486,7 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB97_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6445,6 +6512,7 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB97_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6466,6 +6534,7 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6495,6 +6564,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: 
buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB98_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6522,6 +6592,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB98_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6543,6 +6614,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6573,6 +6645,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6599,6 +6672,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB99_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6621,6 +6695,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6651,6 +6726,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6679,6 +6755,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB100_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6700,6 +6777,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6736,6 +6814,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6768,6 +6847,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6790,6 +6870,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v2, 
0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6824,6 +6905,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6858,6 +6940,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6880,6 +6963,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6915,6 +6999,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB103_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6949,6 +7034,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) 
@@ -6971,6 +7057,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7005,6 +7092,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7039,6 +7127,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7061,6 +7150,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7097,6 +7187,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB105_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_u32_e32 v0, s2, v1 @@ -7131,6 +7222,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: 
.p2align ; VI-NEXT: .LBB105_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_max_u32_e32 v2, s2, v3 @@ -7159,6 +7251,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_u32_e32 v0, s2, v1 @@ -7196,6 +7289,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB106_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_u32_e32 v0, s8, v1 @@ -7237,6 +7331,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB106_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 @@ -7270,6 +7365,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -7311,6 +7407,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB107_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_max_u32_e32 v0, s8, v1 @@ -7350,6 +7447,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v1, s5 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: .p2align ; VI-NEXT: .LBB107_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 @@ -7383,6 +7481,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -7493,6 +7592,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB110_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7518,6 +7618,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB110_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7539,6 +7640,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7568,6 +7670,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB111_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: 
s_waitcnt vmcnt(0) @@ -7595,6 +7698,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB111_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7616,6 +7720,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7646,6 +7751,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB112_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7672,6 +7778,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB112_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7694,6 +7801,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7724,6 +7832,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: 
buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB113_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7752,6 +7861,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB113_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7773,6 +7883,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7809,6 +7920,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB114_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7841,6 +7953,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7863,6 +7976,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7897,6 +8011,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB115_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7931,6 +8046,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB115_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7953,6 +8069,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7988,6 +8105,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB116_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8022,6 +8140,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB116_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8044,6 +8163,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; 
GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8078,6 +8198,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB117_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8112,6 +8233,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB117_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8134,6 +8256,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8241,6 +8364,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB120_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8266,6 +8390,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB120_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8287,6 +8412,7 @@ define 
void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8316,6 +8442,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8343,6 +8470,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8364,6 +8492,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8394,6 +8523,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8420,6 +8550,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; 
VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB122_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8442,6 +8573,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8472,6 +8604,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8500,6 +8633,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8521,6 +8655,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8557,6 +8692,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt 
vmcnt(0) @@ -8589,6 +8725,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8611,6 +8748,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8645,6 +8783,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8679,6 +8818,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8701,6 +8841,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8736,6 +8877,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 
s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8770,6 +8912,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8792,6 +8935,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8826,6 +8970,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8860,6 +9005,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8882,6 +9028,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8918,6 +9065,7 @@ define amdgpu_kernel void 
@atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_min_i32_e32 v0, s2, v1 @@ -8952,6 +9100,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_min_i32_e32 v2, s2, v3 @@ -8980,6 +9129,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s2, v1 @@ -9017,6 +9167,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB129_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_min_i32_e32 v0, s8, v1 @@ -9058,6 +9209,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB129_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 @@ -9091,6 +9243,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -9127,6 +9280,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB130_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_min_i32_e32 v0, s6, v1 @@ -9155,6 +9309,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: .p2align ; VI-NEXT: .LBB130_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_min_i32_e32 v2, s2, v3 @@ -9179,6 +9334,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 @@ -9214,6 +9370,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB131_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_min_i32_e32 v0, s8, v1 @@ -9253,6 +9410,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: .p2align ; VI-NEXT: .LBB131_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 @@ -9286,6 +9444,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB131_1: ; 
%atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -9396,7 +9555,7 @@ define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB134_1: ; %atomicrmw.start +; SI: .LBB134_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4 @@ -9423,7 +9582,7 @@ define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB134_1: ; %atomicrmw.start +; VI: .LBB134_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 @@ -9446,7 +9605,7 @@ define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX9: .LBB134_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 @@ -9477,7 +9636,7 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB135_1: ; %atomicrmw.start +; SI: .LBB135_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4 @@ -9506,7 +9665,7 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: 
s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB135_1: ; %atomicrmw.start +; VI: .LBB135_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 @@ -9529,7 +9688,7 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX9: .LBB135_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 @@ -9561,7 +9720,7 @@ define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB136_1: ; %atomicrmw.start +; SI: .LBB136_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -9589,7 +9748,7 @@ define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB136_1: ; %atomicrmw.start +; VI: .LBB136_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -9613,7 +9772,7 @@ define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX9: .LBB136_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -9645,7 +9804,7 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) 
%out, i32 %i ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB137_1: ; %atomicrmw.start +; SI: .LBB137_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -9675,7 +9834,7 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB137_1: ; %atomicrmw.start +; VI: .LBB137_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -9698,7 +9857,7 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX9: .LBB137_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -9736,7 +9895,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB138_1: ; %atomicrmw.start +; SI: .LBB138_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 @@ -9770,7 +9929,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB138_1: ; %atomicrmw.start +; VI: .LBB138_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3 @@ -9794,7 
+9953,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX9: .LBB138_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 @@ -9830,7 +9989,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB139_1: ; %atomicrmw.start +; SI: .LBB139_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 @@ -9866,7 +10025,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB139_1: ; %atomicrmw.start +; VI: .LBB139_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3 @@ -9890,7 +10049,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB139_1: ; %atomicrmw.start +; GFX9: .LBB139_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 @@ -9927,7 +10086,7 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB140_1: ; %atomicrmw.start +; SI: .LBB140_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -9963,7 +10122,7 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB140_1: ; %atomicrmw.start +; VI: .LBB140_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -9987,7 +10146,7 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB140_1: ; %atomicrmw.start +; GFX9: .LBB140_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -10023,7 +10182,7 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB141_1: ; %atomicrmw.start +; SI: .LBB141_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -10059,7 +10218,7 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB141_1: ; %atomicrmw.start +; VI: .LBB141_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -10083,7 +10242,7 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX9: .LBB141_1: ; %atomicrmw.start ; GFX9-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -10192,7 +10351,7 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s9, s10 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: .LBB144_1: ; %atomicrmw.start +; SI: .LBB144_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 @@ -10221,7 +10380,7 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[6:7], 0 -; VI-NEXT: .LBB144_1: ; %atomicrmw.start +; VI: .LBB144_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 @@ -10246,7 +10405,7 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX9: .LBB144_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 @@ -10279,7 +10438,7 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; SI-NEXT: s_mov_b32 s9, s10 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: .LBB145_1: ; %atomicrmw.start +; SI: .LBB145_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 @@ -10310,7 +10469,7 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword 
v4, v[0:1] ; VI-NEXT: s_mov_b64 s[6:7], 0 -; VI-NEXT: .LBB145_1: ; %atomicrmw.start +; VI: .LBB145_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 @@ -10335,7 +10494,7 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX9: .LBB145_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 @@ -10369,7 +10528,7 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s9, s10 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: .LBB146_1: ; %atomicrmw.start +; SI: .LBB146_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -10399,7 +10558,7 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[6:7], 0 -; VI-NEXT: .LBB146_1: ; %atomicrmw.start +; VI: .LBB146_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -10425,7 +10584,7 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX9: .LBB146_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -10459,7 +10618,7 @@ define i32 
@global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i ; SI-NEXT: s_mov_b32 s9, s10 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: .LBB147_1: ; %atomicrmw.start +; SI: .LBB147_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -10491,7 +10650,7 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[6:7], 0 -; VI-NEXT: .LBB147_1: ; %atomicrmw.start +; VI: .LBB147_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -10516,7 +10675,7 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX9: .LBB147_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -10557,7 +10716,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[38:39], 0 ; SI-NEXT: v_mov_b32_e32 v2, s34 -; SI-NEXT: .LBB148_1: ; %atomicrmw.start +; SI: .LBB148_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1 @@ -10594,7 +10753,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: .LBB148_1: ; %atomicrmw.start +; VI: .LBB148_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3 @@ -10621,7 +10780,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[36:37], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX9: .LBB148_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -10660,7 +10819,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[38:39], 0 ; SI-NEXT: v_mov_b32_e32 v2, s34 -; SI-NEXT: .LBB149_1: ; %atomicrmw.start +; SI: .LBB149_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1 @@ -10699,7 +10858,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: .LBB149_1: ; %atomicrmw.start +; VI: .LBB149_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3 @@ -10726,7 +10885,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[36:37], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: .LBB149_1: ; %atomicrmw.start +; GFX9: .LBB149_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -10766,7 +10925,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[38:39], 0 ; SI-NEXT: v_mov_b32_e32 v2, s34 -; SI-NEXT: 
.LBB150_1: ; %atomicrmw.start +; SI: .LBB150_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, v0 @@ -10805,7 +10964,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: v_mov_b32_e32 v3, s6 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB150_1: ; %atomicrmw.start +; VI: .LBB150_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, v0 @@ -10832,7 +10991,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[36:37], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX9: .LBB150_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v0 @@ -10871,7 +11030,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[38:39], 0 ; SI-NEXT: v_mov_b32_e32 v2, s34 -; SI-NEXT: .LBB151_1: ; %atomicrmw.start +; SI: .LBB151_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, v0 @@ -10910,7 +11069,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: v_mov_b32_e32 v3, s6 -; VI-NEXT: .LBB151_1: ; %atomicrmw.start +; VI: .LBB151_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, v0 @@ -10937,7 +11096,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[36:37], 0 ; 
GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: .LBB151_1: ; %atomicrmw.start +; GFX9: .LBB151_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 74f0f64c935b4..00c3fe1b804a1 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -1331,7 +1331,7 @@ define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB30_1: ; %atomicrmw.start +; SI: .LBB30_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 @@ -1360,7 +1360,7 @@ define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI: .LBB30_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 @@ -1383,7 +1383,7 @@ define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9: .LBB30_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 @@ -1414,7 +1414,7 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; 
SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB31_1: ; %atomicrmw.start +; SI: .LBB31_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 @@ -1445,7 +1445,7 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB31_1: ; %atomicrmw.start +; VI: .LBB31_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 @@ -1468,7 +1468,7 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9: .LBB31_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 @@ -1504,7 +1504,7 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB32_1: ; %atomicrmw.start +; SI: .LBB32_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -1533,7 +1533,7 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB32_1: ; %atomicrmw.start +; VI: .LBB32_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -1558,7 +1558,7 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 
%in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9: .LBB32_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -1595,7 +1595,7 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB33_1: ; %atomicrmw.start +; SI: .LBB33_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -1626,7 +1626,7 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB33_1: ; %atomicrmw.start +; VI: .LBB33_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -1649,7 +1649,7 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9: .LBB33_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -1690,7 +1690,7 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: .LBB34_1: ; %atomicrmw.start +; SI: .LBB34_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v2 @@ -1729,7 +1729,7 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI: .LBB34_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 @@ -1754,7 +1754,7 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9: .LBB34_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 @@ -1792,7 +1792,7 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: .LBB35_1: ; %atomicrmw.start +; SI: .LBB35_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v2 @@ -1831,7 +1831,7 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: .LBB35_1: ; %atomicrmw.start +; VI: .LBB35_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 @@ -1856,7 +1856,7 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; 
GFX9: .LBB35_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 @@ -1895,7 +1895,7 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: .LBB36_1: ; %atomicrmw.start +; SI: .LBB36_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v8, v1 @@ -1934,7 +1934,7 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI: .LBB36_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v8, v1 @@ -1959,7 +1959,7 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9: .LBB36_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v1 @@ -1997,7 +1997,7 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: .LBB37_1: ; %atomicrmw.start +; SI: .LBB37_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v8, v1 @@ -2036,7 +2036,7 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: 
v_mov_b32_e32 v4, s7 -; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI: .LBB37_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v8, v1 @@ -2061,7 +2061,7 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9: .LBB37_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v1 @@ -2171,7 +2171,7 @@ define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB40_1: ; %atomicrmw.start +; SI: .LBB40_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, v7, v3 @@ -2200,7 +2200,7 @@ define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB40_1: ; %atomicrmw.start +; VI: .LBB40_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v5, v7, v3 @@ -2223,7 +2223,7 @@ define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX9: .LBB40_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 @@ -2254,7 +2254,7 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; 
SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB41_1: ; %atomicrmw.start +; SI: .LBB41_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, v7, v3 @@ -2285,7 +2285,7 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB41_1: ; %atomicrmw.start +; VI: .LBB41_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v5, v7, v3 @@ -2308,7 +2308,7 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9: .LBB41_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 @@ -2344,7 +2344,7 @@ define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB42_1: ; %atomicrmw.start +; SI: .LBB42_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -2373,7 +2373,7 @@ define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB42_1: ; %atomicrmw.start +; VI: .LBB42_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ 
-2398,7 +2398,7 @@ define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9: .LBB42_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -2435,7 +2435,7 @@ define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB43_1: ; %atomicrmw.start +; SI: .LBB43_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -2466,7 +2466,7 @@ define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB43_1: ; %atomicrmw.start +; VI: .LBB43_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -2489,7 +2489,7 @@ define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9: .LBB43_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -2529,7 +2529,7 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB44_1: ; %atomicrmw.start +; SI: .LBB44_1: ; %atomicrmw.start ; 
SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, s34, v3 @@ -2567,7 +2567,7 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB44_1: ; %atomicrmw.start +; VI: .LBB44_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, s7, v3 @@ -2591,7 +2591,7 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9: .LBB44_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 @@ -2628,7 +2628,7 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB45_1: ; %atomicrmw.start +; SI: .LBB45_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, s34, v3 @@ -2666,7 +2666,7 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB45_1: ; %atomicrmw.start +; VI: .LBB45_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, s7, v3 @@ -2690,7 +2690,7 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: 
.LBB45_1: ; %atomicrmw.start +; GFX9: .LBB45_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 @@ -2728,7 +2728,7 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB46_1: ; %atomicrmw.start +; SI: .LBB46_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -2766,7 +2766,7 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB46_1: ; %atomicrmw.start +; VI: .LBB46_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v1 @@ -2790,7 +2790,7 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9: .LBB46_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -2827,7 +2827,7 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB47_1: ; %atomicrmw.start +; SI: .LBB47_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -2865,7 +2865,7 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; VI-NEXT: 
s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB47_1: ; %atomicrmw.start +; VI: .LBB47_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v1 @@ -2889,7 +2889,7 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9: .LBB47_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -2999,6 +2999,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB50_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3030,6 +3031,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB50_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3055,6 +3057,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3088,6 +3091,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; 
SI-NEXT: .LBB51_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3121,6 +3125,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB51_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3146,6 +3151,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3184,6 +3190,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB52_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3215,6 +3222,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB52_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3242,6 +3250,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3281,6 +3290,7 @@ define 
i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB53_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3314,6 +3324,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB53_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3339,6 +3350,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3381,6 +3393,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB54_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3421,6 +3434,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3447,6 +3461,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: 
s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3486,6 +3501,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB55_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3526,6 +3542,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3552,6 +3569,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3592,6 +3610,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB56_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3632,6 +3651,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3658,6 +3678,7 @@ define amdgpu_gfx i64 
@global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3697,6 +3718,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB57_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3737,6 +3759,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3763,6 +3786,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3797,6 +3821,7 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB58_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3830,6 +3855,7 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: 
s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB58_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3855,6 +3881,7 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3893,6 +3920,7 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB59_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -3926,6 +3954,7 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB59_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3951,6 +3980,7 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3991,7 +4021,7 @@ define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB60_1: ; %atomicrmw.start +; SI: .LBB60_1: ; %atomicrmw.start ; SI-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v5, v7, v3 @@ -4020,7 +4050,7 @@ define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB60_1: ; %atomicrmw.start +; VI: .LBB60_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v5, v7, v3 @@ -4043,7 +4073,7 @@ define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX9: .LBB60_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 @@ -4074,7 +4104,7 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB61_1: ; %atomicrmw.start +; SI: .LBB61_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v5, v7, v3 @@ -4105,7 +4135,7 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB61_1: ; %atomicrmw.start +; VI: .LBB61_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v5, v7, v3 @@ -4128,7 +4158,7 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 
offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9: .LBB61_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 @@ -4164,7 +4194,7 @@ define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB62_1: ; %atomicrmw.start +; SI: .LBB62_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -4193,7 +4223,7 @@ define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB62_1: ; %atomicrmw.start +; VI: .LBB62_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -4218,7 +4248,7 @@ define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9: .LBB62_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -4255,7 +4285,7 @@ define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB63_1: ; %atomicrmw.start +; SI: .LBB63_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -4286,7 +4316,7 @@ define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; 
VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB63_1: ; %atomicrmw.start +; VI: .LBB63_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -4309,7 +4339,7 @@ define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9: .LBB63_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -4349,7 +4379,7 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB64_1: ; %atomicrmw.start +; SI: .LBB64_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, s34, v3 @@ -4387,7 +4417,7 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB64_1: ; %atomicrmw.start +; VI: .LBB64_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v1, s7, v3 @@ -4411,7 +4441,7 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX9: .LBB64_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 @@ -4448,7 +4478,7 @@ define 
amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB65_1: ; %atomicrmw.start +; SI: .LBB65_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, s34, v3 @@ -4486,7 +4516,7 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB65_1: ; %atomicrmw.start +; VI: .LBB65_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v1, s7, v3 @@ -4510,7 +4540,7 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9: .LBB65_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 @@ -4548,7 +4578,7 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB66_1: ; %atomicrmw.start +; SI: .LBB66_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -4586,7 +4616,7 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB66_1: ; %atomicrmw.start +; VI: .LBB66_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v7, v1 @@ -4610,7 +4640,7 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9: .LBB66_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -4647,7 +4677,7 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB67_1: ; %atomicrmw.start +; SI: .LBB67_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -4685,7 +4715,7 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB67_1: ; %atomicrmw.start +; VI: .LBB67_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v1 @@ -4709,7 +4739,7 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9: .LBB67_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -4819,7 +4849,7 @@ define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB70_1: ; %atomicrmw.start +; SI: .LBB70_1: ; %atomicrmw.start ; SI-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4848,7 +4878,7 @@ define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB70_1: ; %atomicrmw.start +; VI: .LBB70_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4871,7 +4901,7 @@ define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9: .LBB70_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4902,7 +4932,7 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB71_1: ; %atomicrmw.start +; SI: .LBB71_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4933,7 +4963,7 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB71_1: ; %atomicrmw.start +; VI: .LBB71_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4956,7 +4986,7 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; 
GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX9: .LBB71_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 @@ -4992,7 +5022,7 @@ define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB72_1: ; %atomicrmw.start +; SI: .LBB72_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -5021,7 +5051,7 @@ define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB72_1: ; %atomicrmw.start +; VI: .LBB72_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -5046,7 +5076,7 @@ define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX9: .LBB72_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -5083,7 +5113,7 @@ define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB73_1: ; %atomicrmw.start +; SI: .LBB73_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -5114,7 +5144,7 @@ define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; 
VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB73_1: ; %atomicrmw.start +; VI: .LBB73_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -5137,7 +5167,7 @@ define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9: .LBB73_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -5177,7 +5207,7 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB74_1: ; %atomicrmw.start +; SI: .LBB74_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v1, s34, v3 @@ -5215,7 +5245,7 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB74_1: ; %atomicrmw.start +; VI: .LBB74_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -5239,7 +5269,7 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9: .LBB74_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -5276,7 +5306,7 @@ 
define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB75_1: ; %atomicrmw.start +; SI: .LBB75_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v1, s34, v3 @@ -5314,7 +5344,7 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB75_1: ; %atomicrmw.start +; VI: .LBB75_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -5338,7 +5368,7 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9: .LBB75_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 @@ -5376,7 +5406,7 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB76_1: ; %atomicrmw.start +; SI: .LBB76_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -5414,7 +5444,7 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB76_1: ; %atomicrmw.start +; VI: .LBB76_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v7, v1 @@ -5438,7 +5468,7 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9: .LBB76_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -5475,7 +5505,7 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB77_1: ; %atomicrmw.start +; SI: .LBB77_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -5513,7 +5543,7 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB77_1: ; %atomicrmw.start +; VI: .LBB77_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v1 @@ -5537,7 +5567,7 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9: .LBB77_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -5647,6 +5677,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB80_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5677,6 +5708,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB80_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5701,6 +5733,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5733,6 +5766,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB81_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5765,6 +5799,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB81_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5789,6 +5824,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5826,6 +5862,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) 
%ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB82_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5856,6 +5893,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB82_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5882,6 +5920,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5920,6 +5959,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB83_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5952,6 +5992,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB83_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5976,6 +6017,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6019,6 +6061,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6060,6 +6103,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6087,6 +6131,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6127,6 +6172,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6168,6 +6214,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: .p2align ; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6195,6 +6242,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; 
GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6236,6 +6284,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6277,6 +6326,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6304,6 +6354,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6344,6 +6395,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB87_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6385,6 +6437,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: .p2align ; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6412,6 +6465,7 @@ define amdgpu_gfx i64 
@global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6453,6 +6507,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB88_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -6494,6 +6549,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -6527,6 +6583,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -6567,6 +6624,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB89_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -6613,6 +6671,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: .p2align ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; 
VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 @@ -6649,6 +6708,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -6694,6 +6754,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB90_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -6733,6 +6794,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -6766,6 +6828,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -6805,6 +6868,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -6849,6 +6913,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v2, s8 ; 
VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: .p2align ; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 @@ -6885,6 +6950,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -6999,6 +7065,7 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB94_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7029,6 +7096,7 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB94_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7053,6 +7121,7 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7085,6 +7154,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB95_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7117,6 +7187,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB95_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7141,6 +7212,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7178,6 +7250,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB96_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7208,6 +7281,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB96_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7234,6 +7308,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7272,6 +7347,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; 
SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB97_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7304,6 +7380,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB97_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7328,6 +7405,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7371,6 +7449,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB98_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7412,6 +7491,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB98_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7439,6 +7519,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7479,6 +7560,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7520,6 +7602,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: .p2align ; VI-NEXT: .LBB99_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7547,6 +7630,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7588,6 +7672,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7629,6 +7714,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB100_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7656,6 +7742,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .p2align ; 
GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7696,6 +7783,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -7737,6 +7825,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: .p2align ; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7764,6 +7853,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7805,6 +7895,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -7846,6 +7937,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -7879,6 +7971,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -7919,6 +8012,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB103_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -7965,6 +8059,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: .p2align ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 @@ -8001,6 +8096,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -8045,6 +8141,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -8089,6 +8186,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: .p2align ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 @@ -8125,6 +8223,7 @@ define 
amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -8239,6 +8338,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB107_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8269,6 +8369,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB107_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8293,6 +8394,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8325,6 +8427,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB108_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8357,6 +8460,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 
; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB108_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8381,6 +8485,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8418,6 +8523,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB109_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8448,6 +8554,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB109_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8474,6 +8581,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8512,6 +8620,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB110_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8544,6 +8653,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB110_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8568,6 +8678,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8611,6 +8722,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB111_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8652,6 +8764,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB111_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8679,6 +8792,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8719,6 +8833,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 
v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB112_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8760,6 +8875,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: .p2align ; VI-NEXT: .LBB112_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8787,6 +8903,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8828,6 +8945,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB113_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8869,6 +8987,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB113_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -8896,6 +9015,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -8936,6 +9056,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr 
addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB114_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -8977,6 +9098,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: .p2align ; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -9004,6 +9126,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -9115,6 +9238,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB117_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -9145,6 +9269,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB117_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -9169,6 +9294,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -9201,6 +9327,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB118_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -9233,6 +9360,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB118_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -9257,6 +9385,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -9294,6 +9423,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB119_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -9324,6 +9454,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB119_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -9350,6 +9481,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -9388,6 +9520,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB120_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -9420,6 +9553,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB120_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -9444,6 +9578,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -9487,6 +9622,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -9528,6 +9664,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB121_1: ; 
%atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -9555,6 +9692,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -9595,6 +9733,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -9636,6 +9775,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: .p2align ; VI-NEXT: .LBB122_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -9663,6 +9803,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -9704,6 +9845,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -9745,6 +9887,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: 
v_mov_b32_e32 v3, s5 +; VI-NEXT: .p2align ; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -9772,6 +9915,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -9812,6 +9956,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .p2align ; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -9853,6 +9998,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: .p2align ; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -9880,6 +10026,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -9921,6 +10068,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -9962,6 +10110,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, 
i ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: .p2align ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -9995,6 +10144,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -10035,6 +10185,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -10081,6 +10232,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: .p2align ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 @@ -10117,6 +10269,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -10160,6 +10313,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 
vcc, s[2:3], v[2:3] @@ -10195,6 +10349,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: .p2align ; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -10224,6 +10379,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -10262,6 +10418,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -10306,6 +10463,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: .p2align ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 @@ -10342,6 +10500,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -10456,7 +10615,7 @@ define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: 
s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB131_1: ; %atomicrmw.start +; SI: .LBB131_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 @@ -10488,7 +10647,7 @@ define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB131_1: ; %atomicrmw.start +; VI: .LBB131_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 @@ -10514,7 +10673,7 @@ define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX9: .LBB131_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 @@ -10548,7 +10707,7 @@ define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB132_1: ; %atomicrmw.start +; SI: .LBB132_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 @@ -10582,7 +10741,7 @@ define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB132_1: ; %atomicrmw.start +; VI: .LBB132_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 @@ -10608,7 +10767,7 @@ define void 
@global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX9: .LBB132_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 @@ -10647,7 +10806,7 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB133_1: ; %atomicrmw.start +; SI: .LBB133_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -10679,7 +10838,7 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB133_1: ; %atomicrmw.start +; VI: .LBB133_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -10707,7 +10866,7 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX9: .LBB133_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -10747,7 +10906,7 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB134_1: ; %atomicrmw.start +; SI: .LBB134_1: ; 
%atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -10781,7 +10940,7 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB134_1: ; %atomicrmw.start +; VI: .LBB134_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -10807,7 +10966,7 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX9: .LBB134_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -10850,7 +11009,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB135_1: ; %atomicrmw.start +; SI: .LBB135_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 @@ -10891,7 +11050,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB135_1: ; %atomicrmw.start +; VI: .LBB135_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 @@ -10918,7 +11077,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; 
GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX9: .LBB135_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 @@ -10958,7 +11117,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB136_1: ; %atomicrmw.start +; SI: .LBB136_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 @@ -10999,7 +11158,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB136_1: ; %atomicrmw.start +; VI: .LBB136_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 @@ -11026,7 +11185,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX9: .LBB136_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 @@ -11067,7 +11226,7 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB137_1: ; %atomicrmw.start +; SI: .LBB137_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -11108,7 +11267,7 @@ define amdgpu_gfx i64 
@global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB137_1: ; %atomicrmw.start +; VI: .LBB137_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v1 @@ -11135,7 +11294,7 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX9: .LBB137_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -11175,7 +11334,7 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB138_1: ; %atomicrmw.start +; SI: .LBB138_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -11216,7 +11375,7 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB138_1: ; %atomicrmw.start +; VI: .LBB138_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v1 @@ -11243,7 +11402,7 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX9: .LBB138_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_mov_b32_e32 v6, v1 @@ -11356,7 +11515,7 @@ define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s9, s10 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: .LBB141_1: ; %atomicrmw.start +; SI: .LBB141_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 @@ -11390,7 +11549,7 @@ define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[8:9], 0 -; VI-NEXT: .LBB141_1: ; %atomicrmw.start +; VI: .LBB141_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -11418,7 +11577,7 @@ define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX9: .LBB141_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -11454,7 +11613,7 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; SI-NEXT: s_mov_b32 s9, s10 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: .LBB142_1: ; %atomicrmw.start +; SI: .LBB142_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 @@ -11490,7 +11649,7 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[8:9], 0 -; 
VI-NEXT: .LBB142_1: ; %atomicrmw.start +; VI: .LBB142_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -11518,7 +11677,7 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX9: .LBB142_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] @@ -11559,7 +11718,7 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s9, s10 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 ; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: .LBB143_1: ; %atomicrmw.start +; SI: .LBB143_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -11593,7 +11752,7 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[8:9], 0 -; VI-NEXT: .LBB143_1: ; %atomicrmw.start +; VI: .LBB143_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -11623,7 +11782,7 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX9: .LBB143_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -11665,7 +11824,7 @@ define i64 
@global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; SI-NEXT: s_mov_b32 s9, s10 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: .LBB144_1: ; %atomicrmw.start +; SI: .LBB144_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -11701,7 +11860,7 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[8:9], 0 -; VI-NEXT: .LBB144_1: ; %atomicrmw.start +; VI: .LBB144_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -11729,7 +11888,7 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX9: .LBB144_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -11776,7 +11935,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; SI-NEXT: s_mov_b64 s[38:39], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB145_1: ; %atomicrmw.start +; SI: .LBB145_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v2 @@ -11821,7 +11980,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB145_1: ; %atomicrmw.start +; VI: .LBB145_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -11852,7 +12011,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX9: .LBB145_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -11896,7 +12055,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; SI-NEXT: s_mov_b64 s[38:39], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB146_1: ; %atomicrmw.start +; SI: .LBB146_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v2 @@ -11941,7 +12100,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; VI-NEXT: s_mov_b64 s[38:39], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: .LBB146_1: ; %atomicrmw.start +; VI: .LBB146_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -11972,7 +12131,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX9: .LBB146_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] @@ -12017,7 +12176,7 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; SI-NEXT: s_mov_b64 s[38:39], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB147_1: ; %atomicrmw.start +; SI: .LBB147_1: ; %atomicrmw.start ; SI-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -12062,7 +12221,7 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB147_1: ; %atomicrmw.start +; VI: .LBB147_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -12093,7 +12252,7 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX9: .LBB147_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -12137,7 +12296,7 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; SI-NEXT: s_mov_b64 s[38:39], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB148_1: ; %atomicrmw.start +; SI: .LBB148_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -12182,7 +12341,7 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; VI-NEXT: s_mov_b64 s[38:39], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: .LBB148_1: ; %atomicrmw.start +; VI: .LBB148_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -12213,7 +12372,7 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX9: .LBB148_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 37756d15861be..028e5e1c91b75 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -36,7 +36,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -71,7 +71,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -103,7 +103,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -134,7 +134,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032: .LBB0_2: ; %atomicrmw.start ; 
GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -207,7 +207,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -242,7 +242,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-DPP: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -274,7 +274,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -305,7 +305,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, 
v2 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -394,7 +394,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS: .LBB1_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 @@ -418,7 +418,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX7LESS: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 @@ -466,7 +466,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB1_1: ; %ComputeLoop +; GFX9: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -488,7 +488,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX9: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -533,7 +533,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1064-NEXT: 
s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1064: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 @@ -555,7 +555,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1064: .LBB1_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -600,7 +600,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1032: .LBB1_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 @@ -621,7 +621,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1032: .LBB1_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -656,7 +656,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1164: .LBB1_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; 
GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -704,7 +704,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1132: .LBB1_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -764,7 +764,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB1_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX9-DPP: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 @@ -926,7 +926,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB1_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -997,7 +997,7 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB1_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1165,7 +1165,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -1210,7 +1210,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9: .LBB2_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1250,7 +1250,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064: .LBB2_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1289,7 +1289,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: .LBB2_2: ; 
%atomicrmw.start +; GFX1032: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1385,7 +1385,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -1430,7 +1430,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-DPP: .LBB2_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1470,7 +1470,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB2_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1509,7 +1509,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1612,7 +1612,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS: .LBB3_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 @@ -1636,7 +1636,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX7LESS: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1684,7 +1684,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -1706,7 +1706,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX9: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1751,7 +1751,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: 
s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 @@ -1773,7 +1773,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1064: .LBB3_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1818,7 +1818,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032: .LBB3_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 @@ -1839,7 +1839,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1032: .LBB3_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1874,7 +1874,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -1922,7 +1922,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -1982,7 +1982,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB3_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 @@ -2067,7 +2067,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX9-DPP: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 @@ -2144,7 +2144,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB3_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: 
v_add_f32_e32 v0, v1, v2 @@ -2215,7 +2215,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB3_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2383,7 +2383,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -2428,7 +2428,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9: .LBB4_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2468,7 +2468,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064: .LBB4_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2507,7 +2507,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, 
s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2548,7 +2548,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164: .LBB4_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2588,7 +2588,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 -; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2633,7 +2633,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -2678,7 +2678,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-DPP: .LBB4_2: ; %atomicrmw.start ; 
GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2718,7 +2718,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB4_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2757,7 +2757,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2798,7 +2798,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB4_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2838,7 +2838,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 -; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2890,7 +2890,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS: .LBB5_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 @@ -2914,7 +2914,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX7LESS: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2962,7 +2962,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -2984,7 +2984,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX9: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -3029,7 +3029,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_swappc_b64 
s[30:31], s[16:17] ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 @@ -3051,7 +3051,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1064: .LBB5_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -3096,7 +3096,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032: .LBB5_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 @@ -3117,7 +3117,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1032: .LBB5_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -3152,7 +3152,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -3200,7 +3200,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132: .LBB5_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -3260,7 +3260,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB5_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 @@ -3345,7 +3345,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX9-DPP: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 @@ -3422,7 +3422,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB5_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f32_e32 
v0, v1, v2 @@ -3493,7 +3493,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB5_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -3664,7 +3664,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX7LESS-NEXT: .LBB6_1: ; %ComputeLoop +; GFX7LESS: .LBB6_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 @@ -3688,7 +3688,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start +; GFX7LESS: .LBB6_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 @@ -3736,7 +3736,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB6_1: ; %ComputeLoop +; GFX9: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, 
v3, s[0:1] -; GFX9-NEXT: .LBB6_4: ; %atomicrmw.start +; GFX9: .LBB6_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -3803,7 +3803,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1064: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 @@ -3825,7 +3825,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB6_4: ; %atomicrmw.start +; GFX1064: .LBB6_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1032: .LBB6_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 @@ -3891,7 +3891,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB6_4: ; %atomicrmw.start +; GFX1032: .LBB6_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -3926,7 +3926,7 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1164: .LBB6_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -3974,7 +3974,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1132: .LBB6_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -4034,7 +4034,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB6_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 @@ -4119,7 +4119,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-DPP: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 @@ -4196,7 +4196,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; 
GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB6_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4267,7 +4267,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB6_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4435,7 +4435,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX7LESS: .LBB7_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -4480,7 +4480,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX9: .LBB7_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -4520,7 +4520,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1064: .LBB7_2: ; 
%atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -4559,7 +4559,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1032: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -4600,7 +4600,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1164: .LBB7_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4640,7 +4640,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 -; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1132: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4685,7 +4685,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB7_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; 
GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -4730,7 +4730,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX9-DPP: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -4770,7 +4770,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -4809,7 +4809,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -4850,7 +4850,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4890,7 +4890,7 @@ 
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 -; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4941,7 +4941,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX7LESS-NEXT: .LBB8_1: ; %ComputeLoop +; GFX7LESS: .LBB8_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 @@ -4965,7 +4965,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX7LESS: .LBB8_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 @@ -5013,7 +5013,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB8_1: ; %ComputeLoop +; GFX9: .LBB8_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -5035,7 +5035,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX9: .LBB8_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -5080,7 +5080,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1064: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 @@ -5102,7 +5102,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX1064: .LBB8_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -5147,7 +5147,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1032: .LBB8_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 @@ -5168,7 +5168,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX1032: .LBB8_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -5203,7 
+5203,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1164: .LBB8_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -5228,7 +5228,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1164-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX1164: .LBB8_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 @@ -5264,7 +5264,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1132: .LBB8_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -5288,7 +5288,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1132-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX1132: .LBB8_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 @@ -5337,7 +5337,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; 
GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB8_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 @@ -5422,7 +5422,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-DPP: .LBB8_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 @@ -5499,7 +5499,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB8_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -5570,7 +5570,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -5645,7 +5645,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1164-DPP-NEXT: .LBB8_2: ; 
%atomicrmw.start +; GFX1164-DPP: .LBB8_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f32_e32 v4, v5, v0 @@ -5714,7 +5714,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f32_e32 v4, v5, v0 @@ -5755,7 +5755,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -5794,7 +5794,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -5828,7 +5828,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064: .LBB9_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1064-NEXT: global_atomic_cmpswap_x2 
v[0:1], v6, v[0:3], s[0:1] glc @@ -5861,7 +5861,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032: .LBB9_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -5897,7 +5897,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164: .LBB9_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -5933,7 +5933,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132: .LBB9_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -5969,7 +5969,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -6008,7 +6008,7 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-DPP: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -6042,7 +6042,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -6075,7 +6075,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -6111,7 +6111,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6147,7 +6147,7 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6199,7 +6199,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7LESS-NEXT: .LBB10_1: ; %ComputeLoop +; GFX7LESS: .LBB10_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 @@ -6224,7 +6224,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX7LESS: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6276,7 +6276,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9: .LBB10_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -6299,7 +6299,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] 
-; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX9: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6346,7 +6346,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064: .LBB10_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 @@ -6369,7 +6369,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX1064: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6416,7 +6416,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032: .LBB10_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 @@ -6438,7 +6438,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX1032: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6475,7 +6475,7 @@ define 
amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164: .LBB10_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -6502,7 +6502,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX1164: .LBB10_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6540,7 +6540,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132: .LBB10_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -6565,7 +6565,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX1132: .LBB10_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6614,7 +6614,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; 
GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] @@ -6718,7 +6718,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-DPP: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] @@ -6807,7 +6807,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] @@ -6890,7 +6890,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] @@ -6979,7 +6979,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] @@ -7061,7 +7061,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] @@ -7111,7 +7111,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS: .LBB11_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -7159,7 +7159,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9: .LBB11_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7200,7 +7200,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064: .LBB11_2: ; %atomicrmw.start ; 
GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7240,7 +7240,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032: .LBB11_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7282,7 +7282,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164: .LBB11_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7323,7 +7323,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132: .LBB11_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7368,7 +7368,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB11_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], 
v[2:3], v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -7416,7 +7416,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-DPP: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7457,7 +7457,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7497,7 +7497,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7539,7 +7539,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: 
v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7580,7 +7580,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7632,7 +7632,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7LESS-NEXT: .LBB12_1: ; %ComputeLoop +; GFX7LESS: .LBB12_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 @@ -7657,7 +7657,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX7LESS: .LBB12_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7709,7 +7709,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: .LBB12_1: ; %ComputeLoop +; GFX9: .LBB12_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -7732,7 +7732,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX9: .LBB12_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7779,7 +7779,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1064: .LBB12_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 @@ -7802,7 +7802,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX1064: .LBB12_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7849,7 +7849,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1032: .LBB12_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 @@ -7871,7 +7871,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX1032: .LBB12_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt 
vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7908,7 +7908,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1164: .LBB12_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -7935,7 +7935,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX1164: .LBB12_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7973,7 +7973,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1132: .LBB12_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -7998,7 +7998,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX1132: .LBB12_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -8047,7 +8047,7 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB12_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] @@ -8151,7 +8151,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP: .LBB12_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] @@ -8240,7 +8240,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB12_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] @@ -8323,7 +8323,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] @@ -8412,7 +8412,7 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB12_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] @@ -8494,7 +8494,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] @@ -8544,7 +8544,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -8592,7 +8592,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9: .LBB13_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -8633,7 +8633,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: v_mov_b32_e32 v3, 
s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064: .LBB13_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -8673,7 +8673,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -8715,7 +8715,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164: .LBB13_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -8756,7 +8756,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -8801,7 +8801,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP: 
.LBB13_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -8849,7 +8849,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP: .LBB13_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -8890,7 +8890,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB13_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -8930,7 +8930,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -8972,7 +8972,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB13_2: ; %atomicrmw.start ; 
GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9013,7 +9013,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9065,7 +9065,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7LESS-NEXT: .LBB14_1: ; %ComputeLoop +; GFX7LESS: .LBB14_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 @@ -9090,7 +9090,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX7LESS: .LBB14_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9142,7 +9142,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9: .LBB14_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -9165,7 
+9165,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX9: .LBB14_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9212,7 +9212,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064: .LBB14_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 @@ -9235,7 +9235,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX1064: .LBB14_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9282,7 +9282,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032: .LBB14_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 @@ -9304,7 +9304,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB14_4: ; 
%atomicrmw.start +; GFX1032: .LBB14_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9341,7 +9341,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164: .LBB14_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -9368,7 +9368,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX1164: .LBB14_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9406,7 +9406,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1132: .LBB14_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -9431,7 +9431,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX1132: .LBB14_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; 
GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9480,7 +9480,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB14_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] @@ -9584,7 +9584,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start +; GFX9-DPP: .LBB14_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] @@ -9673,7 +9673,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB14_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] @@ -9756,7 +9756,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB14_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 
v[7:8], v[9:10], v[0:1] @@ -9845,7 +9845,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB14_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] @@ -9927,7 +9927,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB14_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] @@ -9980,7 +9980,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7LESS-NEXT: .LBB15_1: ; %ComputeLoop +; GFX7LESS: .LBB15_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 @@ -10005,7 +10005,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX7LESS: .LBB15_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -10057,7 +10057,7 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: .LBB15_1: ; %ComputeLoop +; GFX9: .LBB15_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -10080,7 +10080,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX9: .LBB15_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -10127,7 +10127,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064: .LBB15_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 @@ -10150,7 +10150,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX1064: .LBB15_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -10197,7 +10197,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1032: .LBB15_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 @@ -10219,7 +10219,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX1032: .LBB15_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -10256,7 +10256,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164: .LBB15_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -10283,7 +10283,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX1164: .LBB15_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -10321,7 +10321,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132: .LBB15_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -10346,7 +10346,7 @@ 
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX1132: .LBB15_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -10395,7 +10395,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB15_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] @@ -10499,7 +10499,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX9-DPP: .LBB15_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] @@ -10588,7 +10588,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB15_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] @@ -10671,7 +10671,7 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB15_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] @@ -10760,7 +10760,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB15_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] @@ -10842,7 +10842,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB15_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] @@ -10892,7 +10892,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -10940,7 +10940,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9: .LBB16_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -10981,7 +10981,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064: .LBB16_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11021,7 +11021,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032: .LBB16_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11063,7 +11063,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164: .LBB16_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -11104,7 +11104,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 
-; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132: .LBB16_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -11149,7 +11149,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -11197,7 +11197,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9-DPP: .LBB16_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11238,7 +11238,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB16_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11278,7 +11278,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032-DPP: 
.LBB16_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11320,7 +11320,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB16_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -11361,7 +11361,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB16_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -11413,7 +11413,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7LESS-NEXT: .LBB17_1: ; %ComputeLoop +; GFX7LESS: .LBB17_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 @@ -11438,7 +11438,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX7LESS: .LBB17_4: ; %atomicrmw.start ; 
GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -11490,7 +11490,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: .LBB17_1: ; %ComputeLoop +; GFX9: .LBB17_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -11513,7 +11513,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX9: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -11560,7 +11560,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064: .LBB17_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 @@ -11583,7 +11583,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX1064: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -11630,7 +11630,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: v_mov_b32_e32 
v4, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032: .LBB17_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 @@ -11652,7 +11652,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX1032: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -11689,7 +11689,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164: .LBB17_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -11716,7 +11716,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX1164: .LBB17_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -11754,7 +11754,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132: .LBB17_1: ; %ComputeLoop ; GFX1132-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -11779,7 +11779,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX1132: .LBB17_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -11828,7 +11828,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] @@ -11932,7 +11932,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start +; GFX9-DPP: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] @@ -12021,7 +12021,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] @@ -12104,7 +12104,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] @@ -12193,7 +12193,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB17_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] @@ -12275,7 +12275,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB17_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[0:1] @@ -12315,7 +12315,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX7LESS: .LBB18_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: 
s_waitcnt expcnt(0) @@ -12350,7 +12350,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX9: .LBB18_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12382,7 +12382,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1064: .LBB18_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12413,7 +12413,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1032: .LBB18_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12486,7 +12486,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB18_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -12521,7 +12521,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 
v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX9-DPP: .LBB18_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12553,7 +12553,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB18_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12584,7 +12584,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB18_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12661,7 +12661,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX7LESS: .LBB19_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -12696,7 +12696,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB19_2: 
; %atomicrmw.start +; GFX9: .LBB19_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12728,7 +12728,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1064: .LBB19_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12759,7 +12759,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1032: .LBB19_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12832,7 +12832,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB19_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -12867,7 +12867,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX9-DPP: .LBB19_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: 
v_add_f32_e32 v0, v1, v2 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12899,7 +12899,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB19_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -12930,7 +12930,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB19_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 6351bb39e97f5..a902ec594200f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -32,7 +32,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -64,7 +64,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -157,7 +157,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -189,7 +189,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-DPP: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -302,7 +302,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS: .LBB1_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 @@ -329,7 +329,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX7LESS: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt 
vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -378,7 +378,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: .LBB1_1: ; %ComputeLoop +; GFX9: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -403,7 +403,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX9: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 @@ -449,7 +449,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1064: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 @@ -506,7 +506,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1032: .LBB1_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 @@ -552,7 +552,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 
s[0:1], exec -; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1164: .LBB1_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -603,7 +603,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1132: .LBB1_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -667,7 +667,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB1_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -761,7 +761,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX9-DPP: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1070,7 +1070,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS: .LBB2_2: ; %atomicrmw.start ; 
GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -1102,7 +1102,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9: .LBB2_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -1195,7 +1195,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -1227,7 +1227,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-DPP: .LBB2_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -1341,7 +1341,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS: .LBB3_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 @@ -1368,7 +1368,7 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX7LESS: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -1417,7 +1417,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -1442,7 +1442,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX9: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1488,7 +1488,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032: .LBB3_1: ; 
%ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 @@ -1591,7 +1591,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -1642,7 +1642,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -1706,7 +1706,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB3_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -1800,7 +1800,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX9-DPP: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2110,7 +2110,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -2142,7 +2142,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9: .LBB4_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -2235,7 +2235,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -2267,7 +2267,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-DPP: .LBB4_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -2380,7 +2380,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], 
s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS: .LBB5_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 @@ -2407,7 +2407,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX7LESS: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -2456,7 +2456,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -2481,7 +2481,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX9: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2527,7 +2527,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: 
s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 @@ -2584,7 +2584,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032: .LBB5_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 @@ -2630,7 +2630,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -2681,7 +2681,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132: .LBB5_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -2745,7 +2745,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB5_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -2839,7 +2839,7 
@@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX9-DPP: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -3149,7 +3149,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -3185,7 +3185,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9: .LBB6_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -3250,7 +3250,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164: .LBB6_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3282,7 +3282,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: 
v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132: .LBB6_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3315,7 +3315,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -3351,7 +3351,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-DPP: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -3416,7 +3416,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3448,7 +3448,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: 
v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3501,7 +3501,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop +; GFX7LESS: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3529,7 +3529,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start +; GFX7LESS: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3582,7 +3582,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -3608,7 +3608,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start +; GFX9: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3656,7 +3656,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1064: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -3715,7 +3715,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1032: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -3763,7 +3763,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1164: .LBB7_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -3793,7 +3793,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start +; GFX1164: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3833,7 +3833,7 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1132: .LBB7_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -3862,7 +3862,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start +; GFX1132: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3914,7 +3914,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4026,7 +4026,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX9-DPP: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) @@ -4288,7 +4288,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 
s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -4378,7 +4378,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -4417,7 +4417,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -4453,7 +4453,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9: .LBB8_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -4518,7 +4518,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164: .LBB8_2: ; 
%atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4550,7 +4550,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4583,7 +4583,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -4619,7 +4619,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-DPP: .LBB8_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -4684,7 +4684,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB8_2: ; %atomicrmw.start ; 
GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4716,7 +4716,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4769,7 +4769,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop +; GFX7LESS: .LBB9_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4797,7 +4797,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start +; GFX7LESS: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4850,7 +4850,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: .LBB9_1: ; %ComputeLoop +; GFX9: .LBB9_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -4876,7 +4876,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start +; GFX9: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4924,7 +4924,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1064: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -4983,7 +4983,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1032: .LBB9_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -5031,7 +5031,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1164: .LBB9_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -5061,7 +5061,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; 
GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start +; GFX1164: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5101,7 +5101,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1132: .LBB9_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -5130,7 +5130,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start +; GFX1132: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5182,7 +5182,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5294,7 +5294,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: 
global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-DPP: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) @@ -5556,7 +5556,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -5646,7 +5646,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -5685,7 +5685,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -5721,7 +5721,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -5786,7 +5786,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164: .LBB10_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5818,7 +5818,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132: .LBB10_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5851,7 +5851,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -5887,7 +5887,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-DPP: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: 
v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 @@ -5952,7 +5952,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5984,7 +5984,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6037,7 +6037,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop +; GFX7LESS: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -6065,7 +6065,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start +; GFX7LESS: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6118,7 +6118,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9: .LBB11_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -6144,7 +6144,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start +; GFX9: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6192,7 +6192,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -6251,7 +6251,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -6299,7 +6299,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_mov_b32_e32 v5, 
0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164: .LBB11_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -6329,7 +6329,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start +; GFX1164: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6369,7 +6369,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132: .LBB11_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -6398,7 +6398,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start +; GFX1132: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6450,7 +6450,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: .LBB11_1: 
; %atomicrmw.start +; GFX7LESS-DPP: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6562,7 +6562,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-DPP: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) @@ -6824,7 +6824,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -6914,7 +6914,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -6952,7 +6952,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -6984,7 +6984,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9: .LBB12_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -7077,7 +7077,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -7109,7 +7109,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP: .LBB12_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -7206,7 +7206,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -7238,7 +7238,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; 
GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9: .LBB13_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -7331,7 +7331,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 @@ -7363,7 +7363,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP: .LBB13_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index a9ac00863cd17..95d803eb2cd15 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -32,7 +32,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -64,7 +64,7 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -157,7 +157,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -189,7 +189,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-DPP: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -302,7 +302,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS: .LBB1_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 @@ -329,7 +329,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7LESS-NEXT: 
.LBB1_4: ; %atomicrmw.start +; GFX7LESS: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -378,7 +378,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: .LBB1_1: ; %ComputeLoop +; GFX9: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -403,7 +403,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX9: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 @@ -449,7 +449,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1064: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 @@ -506,7 +506,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1032: .LBB1_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 @@ -552,7 +552,7 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1164: .LBB1_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -603,7 +603,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1132: .LBB1_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -667,7 +667,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB1_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -761,7 +761,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX9-DPP: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1070,7 +1070,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -1102,7 +1102,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9: .LBB2_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -1195,7 +1195,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -1227,7 +1227,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-DPP: .LBB2_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -1341,7 +1341,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS: .LBB3_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 @@ -1368,7 +1368,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX7LESS: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -1417,7 +1417,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -1442,7 +1442,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX9: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1488,7 +1488,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_swappc_b64 s[30:31], 
s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032: .LBB3_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 @@ -1591,7 +1591,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -1642,7 +1642,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -1706,7 +1706,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB3_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -1800,7 +1800,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, 
s[0:1] -; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX9-DPP: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2110,7 +2110,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -2142,7 +2142,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9: .LBB4_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -2235,7 +2235,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -2267,7 +2267,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-DPP: .LBB4_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, 
v0 @@ -2380,7 +2380,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS: .LBB5_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 @@ -2407,7 +2407,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX7LESS: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -2456,7 +2456,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -2481,7 +2481,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX9: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2527,7 +2527,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; 
GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 @@ -2584,7 +2584,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032: .LBB5_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 @@ -2630,7 +2630,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -2681,7 +2681,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132: .LBB5_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -2745,7 +2745,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB5_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -2839,7 +2839,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX9-DPP: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -3149,7 +3149,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -3185,7 +3185,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9: .LBB6_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -3250,7 +3250,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164: .LBB6_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3282,7 +3282,7 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132: .LBB6_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3315,7 +3315,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -3351,7 +3351,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-DPP: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -3416,7 +3416,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3448,7 +3448,7 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3501,7 +3501,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop +; GFX7LESS: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3529,7 +3529,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start +; GFX7LESS: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3582,7 +3582,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -3608,7 +3608,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start +; GFX9: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3656,7 +3656,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1064: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -3715,7 +3715,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1032: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -3763,7 +3763,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1164: .LBB7_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -3793,7 +3793,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start +; GFX1164: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3833,7 +3833,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1132: .LBB7_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -3862,7 +3862,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start +; GFX1132: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -3914,7 +3914,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4026,7 +4026,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX9-DPP: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt 
vmcnt(0) @@ -4288,7 +4288,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -4378,7 +4378,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -4417,7 +4417,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -4453,7 +4453,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9: .LBB8_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -4518,7 +4518,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164: .LBB8_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4550,7 +4550,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4583,7 +4583,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -4619,7 +4619,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-DPP: .LBB8_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -4684,7 +4684,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; 
GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB8_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4716,7 +4716,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4769,7 +4769,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop +; GFX7LESS: .LBB9_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4797,7 +4797,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start +; GFX7LESS: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4850,7 +4850,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: 
v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: .LBB9_1: ; %ComputeLoop +; GFX9: .LBB9_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -4876,7 +4876,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start +; GFX9: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4924,7 +4924,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1064: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -4983,7 +4983,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1032: .LBB9_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -5031,7 +5031,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1164: .LBB9_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: 
s_ctz_i32_b64 s4, s[0:1] @@ -5061,7 +5061,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start +; GFX1164: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5101,7 +5101,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1132: .LBB9_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -5130,7 +5130,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start +; GFX1132: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5182,7 +5182,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5294,7 +5294,7 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-DPP: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) @@ -5556,7 +5556,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -5646,7 +5646,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -5685,7 +5685,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -5721,7 +5721,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -5786,7 +5786,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164: .LBB10_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5818,7 +5818,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132: .LBB10_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5851,7 +5851,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -5887,7 +5887,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; 
GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-DPP: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 @@ -5952,7 +5952,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5984,7 +5984,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6037,7 +6037,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop +; GFX7LESS: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -6065,7 +6065,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: v_max_f64 v[4:5], 
v[4:5], v[4:5] -; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start +; GFX7LESS: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6118,7 +6118,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9: .LBB11_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -6144,7 +6144,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start +; GFX9: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6192,7 +6192,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -6251,7 +6251,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] @@ -6299,7 +6299,7 @@ define 
amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164: .LBB11_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -6329,7 +6329,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start +; GFX1164: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6369,7 +6369,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132: .LBB11_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -6398,7 +6398,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start +; GFX1132: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6450,7 +6450,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], 
off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6562,7 +6562,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-DPP: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) @@ -6824,7 +6824,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -6914,7 +6914,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] @@ -6952,7 +6952,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: 
s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -6984,7 +6984,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9: .LBB12_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -7077,7 +7077,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -7109,7 +7109,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP: .LBB12_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -7206,7 +7206,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; 
GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -7238,7 +7238,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9: .LBB13_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -7331,7 +7331,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 @@ -7363,7 +7363,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP: .LBB13_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 6311143f57260..b7b6cbda7ee2b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -36,7 +36,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -71,7 +71,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -103,7 +103,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -134,7 +134,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -168,7 +168,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1164: .LBB0_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -202,7 +202,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1132: .LBB0_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -237,7 +237,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -272,7 +272,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-DPP: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -304,7 +304,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -335,7 +335,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -369,7 +369,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB0_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -403,7 +403,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB0_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -454,7 +454,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS: .LBB1_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 @@ -478,7 +478,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB1_4: ; 
%atomicrmw.start +; GFX7LESS: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -526,7 +526,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB1_1: ; %ComputeLoop +; GFX9: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -548,7 +548,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX9: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -593,7 +593,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1064: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 @@ -615,7 +615,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1064: .LBB1_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -660,7 +660,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; 
GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1032: .LBB1_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 @@ -681,7 +681,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1032: .LBB1_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -716,7 +716,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1164: .LBB1_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -741,7 +741,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1164: .LBB1_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -777,7 +777,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1132: .LBB1_1: ; %ComputeLoop ; 
GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -801,7 +801,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1132: .LBB1_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -850,7 +850,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB1_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 @@ -935,7 +935,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX9-DPP: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 @@ -1012,7 +1012,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB1_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: 
v_sub_f32_e32 v0, v1, v2 @@ -1083,7 +1083,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB1_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1158,7 +1158,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB1_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 @@ -1227,7 +1227,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB1_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 @@ -1277,7 +1277,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9: .LBB2_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1362,7 +1362,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064: .LBB2_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1032: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1442,7 +1442,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1164: .LBB2_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 -; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1132: .LBB2_2: ; 
%atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1527,7 +1527,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -1572,7 +1572,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-DPP: .LBB2_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1612,7 +1612,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB2_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1651,7 +1651,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_sub_f32_e32 
v0, v1, v2 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -1692,7 +1692,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB2_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 -; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1784,7 +1784,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS: .LBB3_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 @@ -1808,7 +1808,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX7LESS: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1856,7 +1856,7 @@ define 
amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -1878,7 +1878,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX9: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1923,7 +1923,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 @@ -1945,7 +1945,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1064: .LBB3_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1990,7 +1990,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032: .LBB3_1: ; %ComputeLoop ; GFX1032-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 @@ -2011,7 +2011,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1032: .LBB3_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2046,7 +2046,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -2071,7 +2071,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1164: .LBB3_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2107,7 +2107,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -2131,7 +2131,7 @@ define 
amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1132: .LBB3_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2180,7 +2180,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB3_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 @@ -2265,7 +2265,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX9-DPP: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 @@ -2342,7 +2342,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB3_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2413,7 +2413,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB3_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2488,7 +2488,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB3_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 @@ -2557,7 +2557,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB3_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 @@ -2607,7 +2607,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -2652,7 +2652,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9: .LBB4_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2692,7 +2692,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064: .LBB4_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2731,7 +2731,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2772,7 +2772,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164: .LBB4_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2812,7 +2812,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 -; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2857,7 +2857,7 @@ define amdgpu_kernel 
void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -2902,7 +2902,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-DPP: .LBB4_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2942,7 +2942,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB4_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -2981,7 +2981,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -3022,7 +3022,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB4_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -3062,7 +3062,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 -; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -3114,7 +3114,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS: .LBB5_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 @@ -3138,7 +3138,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX7LESS: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -3186,7 +3186,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 
-; GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -3208,7 +3208,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX9: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -3253,7 +3253,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 @@ -3275,7 +3275,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1064: .LBB5_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -3320,7 +3320,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032: .LBB5_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 @@ -3341,7 +3341,7 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1032: .LBB5_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -3376,7 +3376,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -3401,7 +3401,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1164: .LBB5_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -3437,7 +3437,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132: .LBB5_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -3461,7 +3461,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: 
global_load_b32 v1, v3, s[0:1] -; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1132: .LBB5_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -3510,7 +3510,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB5_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 @@ -3595,7 +3595,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX9-DPP: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 @@ -3672,7 +3672,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB5_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -3743,7 +3743,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB5_2: ; %atomicrmw.start ; 
GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -3818,7 +3818,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB5_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 @@ -3887,7 +3887,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB5_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 @@ -3940,7 +3940,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX7LESS-NEXT: .LBB6_1: ; %ComputeLoop +; GFX7LESS: .LBB6_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 @@ -3964,7 +3964,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start +; GFX7LESS: .LBB6_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4012,7 +4012,7 @@ 
define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB6_1: ; %ComputeLoop +; GFX9: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -4034,7 +4034,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB6_4: ; %atomicrmw.start +; GFX9: .LBB6_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4079,7 +4079,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1064: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 @@ -4101,7 +4101,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB6_4: ; %atomicrmw.start +; GFX1064: .LBB6_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4146,7 +4146,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1032: .LBB6_1: ; %ComputeLoop ; 
GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 @@ -4167,7 +4167,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB6_4: ; %atomicrmw.start +; GFX1032: .LBB6_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4202,7 +4202,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1164: .LBB6_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -4227,7 +4227,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1164-NEXT: .LBB6_4: ; %atomicrmw.start +; GFX1164: .LBB6_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4263,7 +4263,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1132: .LBB6_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -4287,7 +4287,7 @@ 
define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1132-NEXT: .LBB6_4: ; %atomicrmw.start +; GFX1132: .LBB6_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4336,7 +4336,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB6_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 @@ -4421,7 +4421,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-DPP: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 @@ -4498,7 +4498,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB6_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4569,7 +4569,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB6_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4644,7 +4644,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 @@ -4713,7 +4713,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 @@ -4763,7 +4763,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX7LESS: .LBB7_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -4808,7 +4808,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX9: .LBB7_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -4848,7 +4848,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1064: .LBB7_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -4887,7 +4887,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1032: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -4928,7 +4928,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1164: .LBB7_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4968,7 +4968,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 -; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1132: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5013,7 +5013,7 @@ define 
amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB7_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -5058,7 +5058,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX9-DPP: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -5098,7 +5098,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -5137,7 +5137,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc @@ -5178,7 +5178,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5218,7 +5218,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 -; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5269,7 +5269,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX7LESS-NEXT: .LBB8_1: ; %ComputeLoop +; GFX7LESS: .LBB8_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 @@ -5293,7 +5293,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX7LESS: .LBB8_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5341,7 +5341,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 
-; GFX9-NEXT: .LBB8_1: ; %ComputeLoop +; GFX9: .LBB8_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 @@ -5363,7 +5363,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] -; GFX9-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX9: .LBB8_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5408,7 +5408,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1064: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 @@ -5430,7 +5430,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1064-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX1064: .LBB8_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5475,7 +5475,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1032: .LBB8_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 @@ -5496,7 +5496,7 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] -; GFX1032-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX1032: .LBB8_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5531,7 +5531,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1164: .LBB8_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -5556,7 +5556,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] -; GFX1164-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX1164: .LBB8_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5592,7 +5592,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1132: .LBB8_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -5616,7 +5616,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: 
global_load_b32 v1, v3, s[0:1] -; GFX1132-NEXT: .LBB8_4: ; %atomicrmw.start +; GFX1132: .LBB8_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5665,7 +5665,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB8_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 @@ -5750,7 +5750,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] -; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-DPP: .LBB8_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 @@ -5827,7 +5827,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB8_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5898,7 +5898,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] -; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB8_2: ; %atomicrmw.start ; 
GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5973,7 +5973,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB8_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 @@ -6042,7 +6042,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] -; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 @@ -6083,7 +6083,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -6122,7 +6122,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -6156,7 +6156,7 @@ define 
amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064: .LBB9_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -6189,7 +6189,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032: .LBB9_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -6225,7 +6225,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164: .LBB9_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6261,7 +6261,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132: .LBB9_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6297,7 +6297,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 ; 
GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -6336,7 +6336,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-DPP: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -6370,7 +6370,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -6403,7 +6403,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -6439,7 +6439,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 
s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6475,7 +6475,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6527,7 +6527,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7LESS-NEXT: .LBB10_1: ; %ComputeLoop +; GFX7LESS: .LBB10_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 @@ -6552,7 +6552,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX7LESS: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6604,7 +6604,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9: .LBB10_1: ; %ComputeLoop 
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -6627,7 +6627,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX9: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6674,7 +6674,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064: .LBB10_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 @@ -6697,7 +6697,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX1064: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6744,7 +6744,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032: .LBB10_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 @@ -6766,7 +6766,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 
v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX1032: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6803,7 +6803,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164: .LBB10_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -6830,7 +6830,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start +; GFX1164: .LBB10_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6868,7 +6868,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132: .LBB10_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -6893,7 +6893,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start 
+; GFX1132: .LBB10_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6942,7 +6942,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] @@ -7046,7 +7046,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-DPP: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] @@ -7135,7 +7135,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] @@ -7218,7 +7218,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB10_2: ; 
%atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] @@ -7307,7 +7307,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] @@ -7389,7 +7389,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] @@ -7439,7 +7439,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS: .LBB11_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -7487,7 +7487,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9: .LBB11_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX9-NEXT: 
global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7528,7 +7528,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064: .LBB11_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7568,7 +7568,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032: .LBB11_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7610,7 +7610,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164: .LBB11_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -7651,7 +7651,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132: .LBB11_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -7696,7 +7696,7 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB11_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -7744,7 +7744,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-DPP: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7785,7 +7785,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7825,7 +7825,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -7867,7 +7867,7 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -7908,7 +7908,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -7959,7 +7959,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7LESS-NEXT: .LBB12_1: ; %ComputeLoop +; GFX7LESS: .LBB12_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 @@ -7984,7 +7984,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX7LESS: .LBB12_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8036,7 +8036,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; 
GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: .LBB12_1: ; %ComputeLoop +; GFX9: .LBB12_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -8059,7 +8059,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX9: .LBB12_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8106,7 +8106,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1064: .LBB12_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 @@ -8129,7 +8129,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX1064: .LBB12_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8176,7 +8176,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1032: .LBB12_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 
s3, v1, s1 @@ -8198,7 +8198,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX1032: .LBB12_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8235,7 +8235,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1164: .LBB12_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -8262,7 +8262,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX1164: .LBB12_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8300,7 +8300,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1132: .LBB12_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -8325,7 +8325,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start +; GFX1132: .LBB12_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8374,7 +8374,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB12_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] @@ -8478,7 +8478,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP: .LBB12_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] @@ -8567,7 +8567,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB12_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] @@ -8650,7 +8650,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] @@ -8739,7 +8739,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB12_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] @@ -8821,7 +8821,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] @@ -8871,7 +8871,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -8919,7 +8919,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9: 
.LBB13_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -8960,7 +8960,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064: .LBB13_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -9000,7 +9000,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -9042,7 +9042,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164: .LBB13_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9083,7 +9083,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9128,7 +9128,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -9176,7 +9176,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP: .LBB13_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -9217,7 +9217,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB13_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -9257,7 +9257,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1032-DPP-NEXT: 
global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -9299,7 +9299,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB13_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9340,7 +9340,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9392,7 +9392,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7LESS-NEXT: .LBB14_1: ; %ComputeLoop +; GFX7LESS: .LBB14_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 @@ -9417,7 +9417,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX7LESS: .LBB14_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9469,7 +9469,7 @@ define 
amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9: .LBB14_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -9492,7 +9492,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX9: .LBB14_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9539,7 +9539,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064: .LBB14_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 @@ -9562,7 +9562,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX1064: .LBB14_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9609,7 +9609,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032: .LBB14_1: ; %ComputeLoop ; 
GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 @@ -9631,7 +9631,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX1032: .LBB14_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9668,7 +9668,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164: .LBB14_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -9695,7 +9695,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX1164: .LBB14_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9733,7 +9733,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1132: .LBB14_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -9758,7 
+9758,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start +; GFX1132: .LBB14_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9807,7 +9807,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB14_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] @@ -9911,7 +9911,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start +; GFX9-DPP: .LBB14_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] @@ -10000,7 +10000,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB14_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] @@ -10083,7 +10083,7 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB14_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] @@ -10172,7 +10172,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB14_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] @@ -10254,7 +10254,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB14_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] @@ -10307,7 +10307,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7LESS-NEXT: .LBB15_1: ; %ComputeLoop +; GFX7LESS: .LBB15_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 @@ -10332,7 +10332,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX7LESS: .LBB15_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -10384,7 +10384,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: .LBB15_1: ; %ComputeLoop +; GFX9: .LBB15_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -10407,7 +10407,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX9: .LBB15_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -10454,7 +10454,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064: .LBB15_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 @@ -10477,7 +10477,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX1064: .LBB15_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -10524,7 +10524,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1032: .LBB15_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 @@ -10546,7 +10546,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX1032: .LBB15_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -10583,7 +10583,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164: .LBB15_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -10610,7 +10610,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX1164: .LBB15_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -10648,7 +10648,7 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132: .LBB15_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -10673,7 +10673,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX1132: .LBB15_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -10722,7 +10722,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB15_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] @@ -10826,7 +10826,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX9-DPP: .LBB15_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] @@ -10915,7 +10915,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 
s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB15_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] @@ -10998,7 +10998,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB15_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] @@ -11087,7 +11087,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB15_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] @@ -11169,7 +11169,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB15_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] @@ -11218,7 +11218,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 ; 
GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -11266,7 +11266,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9: .LBB16_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11307,7 +11307,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064: .LBB16_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11347,7 +11347,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032: .LBB16_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11389,7 +11389,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164: .LBB16_2: ; 
%atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -11430,7 +11430,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132: .LBB16_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -11475,7 +11475,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) @@ -11523,7 +11523,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9-DPP: .LBB16_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11564,7 +11564,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB16_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11604,7 +11604,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB16_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -11646,7 +11646,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB16_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -11687,7 +11687,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB16_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -11739,7 +11739,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7LESS-NEXT: .LBB17_1: ; %ComputeLoop +; GFX7LESS: .LBB17_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 @@ -11764,7 +11764,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX7LESS: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -11816,7 +11816,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-NEXT: .LBB17_1: ; %ComputeLoop +; GFX9: .LBB17_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 @@ -11839,7 +11839,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX9: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -11886,7 +11886,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064: .LBB17_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 @@ -11909,7 +11909,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX1064: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -11956,7 +11956,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032: .LBB17_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 @@ -11978,7 +11978,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX1032: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -12015,7 +12015,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164: .LBB17_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] @@ -12042,7 +12042,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX1164: .LBB17_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -12080,7 +12080,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132: .LBB17_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 @@ -12105,7 +12105,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX1132: .LBB17_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -12154,7 +12154,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-DPP: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] @@ -12258,7 +12258,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] -; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start +; GFX9-DPP: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 
v[9:10], v[11:12], -s[0:1] @@ -12347,7 +12347,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start +; GFX1064-DPP: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] @@ -12430,7 +12430,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[9:10], v2, s[0:1] -; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start +; GFX1032-DPP: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[7:8], v[9:10], -v[0:1] @@ -12519,7 +12519,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start +; GFX1164-DPP: .LBB17_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] @@ -12601,7 +12601,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] -; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start +; GFX1132-DPP: .LBB17_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[6:7], 
v[8:9], -v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll index fe462fb44b4ab..550970bbb680d 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -9,6 +9,7 @@ define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid) ; SI-NEXT: ; implicit-def: $sgpr6_sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 ; SI-NEXT: s_branch .LBB0_3 +; SI-NEXT: .p2align ; SI-NEXT: .LBB0_1: ; in Loop: Header=BB0_3 Depth=1 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: .LBB0_2: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 56ceba258f471..faa05f0d930a5 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -88,6 +88,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_mov_b32 s69, s68 @@ -171,6 +172,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3 ; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_8: ; %bb33 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_f32_e32 v2, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 835818fb2fd15..229a501295ee4 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -22,6 +22,7 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s8, s5, s4 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: 
.p2align 4 ; GFX9-NEXT: .LBB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_not_b32 s10, s5 @@ -69,6 +70,7 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-NEXT: s_add_i32 s8, s4, s5 ; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB0_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -185,6 +187,7 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s8, s5, s4 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_not_b32 s10, s5 @@ -230,6 +233,7 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-NEXT: s_add_i32 s8, s4, s5 ; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB1_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_not_b32 s9, s5 @@ -343,6 +347,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 ; GFX9-NEXT: s_add_i32 s5, s6, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5 @@ -387,6 +392,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_add_i32 s5, s5, s6 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB2_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5 @@ -498,6 +504,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 
s4, s5, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 @@ -537,6 +544,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3 ; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_add_i32 s4, s4, s5 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB3_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_mul_hi_u32 s5, s3, s4 @@ -632,6 +640,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s2 @@ -663,6 +672,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s2 @@ -745,6 +755,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s3 @@ -778,6 +789,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s3 @@ 
-865,6 +877,7 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 @@ -902,6 +915,7 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s4, s3 @@ -996,6 +1010,7 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_sext_i32_i16 s6, s3 @@ -1035,6 +1050,7 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s4, s3 diff --git a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll index f582f984a3924..fed9aca10c4f2 100644 --- a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll @@ -13,6 +13,7 @@ define amdgpu_kernel void @func(ptr addrspace(1) %in, ptr addrspace(3) %out) { ; CHECK-NEXT: s_mov_b32 s3, 32 ; CHECK-NEXT: s_mov_b32 s2, 0 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 17a5f520ff41e..477d2b13f1bc9 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -9008,6 +9008,7 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: s_branch .LBB26_2 +; GENERIC-NEXT: .p2align ; GENERIC-NEXT: .LBB26_1: ; %Flow ; GENERIC-NEXT: ; in Loop: Header=BB26_2 Depth=1 ; GENERIC-NEXT: s_andn2_b64 vcc, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index 0a493e5188ad5..d5d0e3caf18a8 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -117,6 +117,7 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB3_1: ; %bb0 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e2e43faca5aa..58059ba227053 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -10,6 +10,7 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) { ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB0_1: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -48,6 +49,7 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-NEXT: s_mov_b32 s2, -1 ; 
SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB1_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,6 +94,7 @@ define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB2_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -112,6 +115,7 @@ define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB2_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -160,6 +164,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: ; =>This Loop Header: Depth=1 ; SI-NEXT: ; Child Loop BB3_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB3_3: ; %inner_loop ; SI-NEXT: ; Parent Loop BB3_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index eb5c5ef15ed56..3a76691560ab6 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -172,6 +172,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_cbranch_vccz .LBB2_12 ; GFX11-NEXT: ; %bb.9: ; GFX11-NEXT: s_xor_b32 s0, s8, -1 +; GFX11-NEXT: .p2align 5 ; GFX11-NEXT: .LBB2_10: ; %bb17 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index 58cd2f5bc11af..5e01190afd6bb 100644 --- 
a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -32,7 +32,7 @@ define fastcc i32 @foo() { ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1 (%ir-block.1): + ; CHECK-NEXT: bb.1 (%ir-block.1, align 32): ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; CHECK-NEXT: liveins: $vcc_lo ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index a3b0a7768ca67..b4249bc5eb835 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -16,6 +16,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX9-NEXT: flat_load_dword v4, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v2 @@ -36,6 +37,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 @@ -56,6 +58,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 @@ -78,6 +81,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX9-FLATSCR-NEXT: flat_load_dword v4, v[0:1] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-FLATSCR-NEXT: .p2align ; GFX9-FLATSCR-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-FLATSCR-NEXT: v_add_f32_e32 v3, v4, v2 @@ -98,6 +102,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -143,6 +148,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9-NEXT: global_load_dword v2, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v2 @@ -166,6 +172,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX90A-NEXT: global_load_dword v2, v[0:1], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -191,6 +198,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX10-NEXT: global_load_dword v2, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 @@ 
-216,6 +224,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9-FLATSCR-NEXT: global_load_dword v2, v[0:1], off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-FLATSCR-NEXT: .p2align ; GFX9-FLATSCR-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v2 @@ -239,6 +248,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX11-NEXT: global_load_b32 v2, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 @@ -270,6 +280,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: global_load_b32 v2, v[0:1], off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll index 300124848c1aa..0a013bdfbbeb2 100644 --- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll @@ -33,6 +33,7 @@ define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK-NEXT: s_mov_b32 s46, 1 ; CHECK-NEXT: s_movk_i32 s45, 0x990 ; CHECK-NEXT: s_mov_b32 s48, 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %bb3 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_cmp_eq_u32 s46, 0 diff --git a/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll b/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll index f18a657b8082d..2071cc49544f7 100644 --- a/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll +++ 
b/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll @@ -20,6 +20,7 @@ define amdgpu_kernel void @stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b9 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB0_2: ; %.lr.ph ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll index 684e3257a1290..01a80f5840ba4 100644 --- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll @@ -19,6 +19,7 @@ define amdgpu_ps void @return_void(float %0) #0 { ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -63,6 +64,7 @@ define amdgpu_ps void @return_void_compr(float %0) #0 { ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB1_3 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -102,6 +104,7 @@ define amdgpu_ps void @only_kill() #0 { ; CHECK-LABEL: only_kill: ; CHECK: ; %bb.0: ; %main_body ; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB2_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -135,6 +138,7 @@ define amdgpu_ps float @return_nonvoid(float %0) #0 { ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB3_3 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB3_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll index 0a2e7afa3d417..7d0423d568a01 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -1404,6 +1404,7 @@ define amdgpu_ps void @test_scc_liveness() #0 { ; SI: ; %bb.0: ; %main_body ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB25_1: ; %loop3 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_cmp_gt_i32 s2, 0 @@ -1429,6 +1430,7 @@ define amdgpu_ps void @test_scc_liveness() #0 { ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b64 s[0:1], exec ; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB25_1: ; %loop3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_cmp_gt_i32 s2, 0 @@ -1454,6 +1456,7 @@ define amdgpu_ps void @test_scc_liveness() #0 { ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB25_1: ; %loop3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_cmp_gt_i32 s2, 0 @@ -1479,6 +1482,7 @@ define amdgpu_ps void @test_scc_liveness() #0 { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_mov_b64 s[0:1], exec ; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB25_1: ; %loop3 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_cmp_gt_i32 s2, 0 @@ -1531,6 +1535,7 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, ; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: v_mov_b32_e32 v0, 0x3fc00000 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB26_2: ; %bb ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_and_b64 vcc, exec, s[0:1] @@ -1563,6 +1568,7 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, ; GFX10-NEXT: s_mov_b64 
s[2:3], exec ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX10-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB26_2: ; %bb ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_add_f32_e32 v0, 0x3e800000, v0 @@ -1595,6 +1601,7 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, ; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX11-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB26_2: ; %bb ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_add_f32_e32 v0, 0x3e800000, v0 @@ -1630,6 +1637,7 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, ; GFX12-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX12-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB26_2: ; %bb ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_add_f32 s4, s4, 0x3e800000 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll index dbe95a8091932..595db95658a37 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll @@ -38,6 +38,7 @@ define amdgpu_ps void @test(ptr addrspace(1) inreg %ptr) { define amdgpu_ps void @test_loop() { ; GFX9-LABEL: test_loop: ; GFX9: ; %bb.0: +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB1_1: ; %loop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id @@ -48,6 +49,7 @@ define amdgpu_ps void @test_loop() { ; ; GFX10-LABEL: test_loop: ; GFX10: ; %bb.0: +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB1_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll index a2c1545743039..971047f0be725 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll @@ -16,6 +16,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB0_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -35,6 +36,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) { ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB0_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -64,6 +66,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB1_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +86,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) { ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB1_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -111,6 +115,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB2_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -131,6 +136,7 @@ define 
amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) { ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_mov_b32 s5, 4 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB2_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -159,6 +165,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB3_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -178,6 +185,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) { ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB3_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -211,6 +219,7 @@ define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) { ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB4_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo @@ -231,6 +240,7 @@ define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) { ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB4_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo @@ -257,6 +267,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB5_1: ; %bb1 ; GFX11-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -277,6 +288,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB5_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -307,6 +319,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB6_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -326,6 +339,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) { ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -356,6 +370,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-SDAG-TRUE16-NEXT: .p2align ; GFX11-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1 ; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -377,6 +392,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: .p2align ; GFX11-FAKE16-NEXT: .LBB7_1: ; %bb1 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -398,6 +414,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x 
i32> %addr) { ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-GISEL-TRUE16-NEXT: .p2align ; GFX11-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1 ; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -417,6 +434,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX11-GISEL-NEXT: .p2align ; GFX11-GISEL-NEXT: .LBB7_1: ; %bb1 ; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -439,6 +457,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-SDAG-TRUE16-NEXT: .p2align ; GFX12-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1 ; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 @@ -461,6 +480,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB7_1: ; %bb1 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -483,6 +503,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-TRUE16-NEXT: .p2align ; GFX12-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1 ; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 @@ -515,6 +536,7 @@ 
define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB8_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -534,6 +556,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) { ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB8_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -564,6 +587,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB9_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -585,6 +609,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) { ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB9_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll index 6f7c001e03e26..096022ce2dc27 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll @@ -16,6 +16,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) % ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB0_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -35,6 +36,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) % ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB0_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -64,6 +66,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) % ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB1_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +86,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) % ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB1_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -111,6 +115,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB2_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -131,6 +136,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_mov_b32 s5, 4 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB2_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -159,6 +165,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) % ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 
; GFX11-NEXT: .LBB3_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -178,6 +185,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) % ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB3_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -211,6 +219,7 @@ define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB4_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo @@ -231,6 +240,7 @@ define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB4_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo @@ -257,6 +267,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB5_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -277,6 +288,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB5_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -307,6 +319,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %pt ; GFX11-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB6_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -326,6 +339,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %pt ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -356,6 +370,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-SDAG-TRUE16-NEXT: .p2align ; GFX11-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1 ; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -377,6 +392,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: .p2align ; GFX11-FAKE16-NEXT: .LBB7_1: ; %bb1 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -398,6 +414,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-GISEL-TRUE16-NEXT: .p2align ; GFX11-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1 ; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -417,6 +434,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; 
GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX11-GISEL-NEXT: .p2align ; GFX11-GISEL-NEXT: .LBB7_1: ; %bb1 ; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -439,6 +457,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-SDAG-TRUE16-NEXT: .p2align ; GFX12-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1 ; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 @@ -461,6 +480,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB7_1: ; %bb1 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -483,6 +503,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-TRUE16-NEXT: .p2align ; GFX12-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1 ; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 @@ -515,6 +536,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %pt ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB8_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -534,6 +556,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %pt ; 
GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB8_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -564,6 +587,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB9_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -585,6 +609,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB9_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index 88963643218a5..e174cccd74ebf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -20,6 +20,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB0_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -42,6 +43,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB0_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT @@ 
-70,6 +72,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %ad ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB1_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -90,6 +93,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %ad ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB1_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -123,6 +127,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB2_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -145,6 +150,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB2_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT @@ -177,6 +183,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB3_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc @@ -200,6 +207,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i ; GFX12-NEXT: s_mov_b32 s5, 4 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB3_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], s5 idxen offset:4 th:TH_LOAD_NT @@ -231,6 +239,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB4_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc @@ -253,6 +262,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB4_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT_RT @@ -287,6 +297,7 @@ define amdgpu_kernel void @struct_nonatomic_buffer_load_i32(<4 x i32> %addr, i32 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB5_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo @@ -310,6 +321,7 @@ define amdgpu_kernel void @struct_nonatomic_buffer_load_i32(<4 x i32> %addr, i32 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB5_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo @@ -340,6 +352,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB6_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: 
buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc @@ -363,6 +376,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT @@ -396,6 +410,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB7_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -418,6 +433,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB7_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT @@ -451,6 +467,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-SDAG-TRUE16-NEXT: .p2align ; GFX11-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -475,6 +492,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-FAKE16-NEXT: .p2align ; GFX11-FAKE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -499,6 +517,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-GISEL-TRUE16-NEXT: .p2align ; GFX11-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -521,6 +540,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX11-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-GISEL-NEXT: .p2align ; GFX11-GISEL-NEXT: .LBB8_1: ; %bb1 ; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -546,6 +566,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-SDAG-TRUE16-NEXT: .p2align ; GFX12-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT @@ -571,6 +592,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT @@ -596,6 +618,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 
s4, 0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-GISEL-TRUE16-NEXT: .p2align ; GFX12-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT @@ -631,6 +654,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB9_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc @@ -653,6 +677,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB9_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT @@ -686,6 +711,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB10_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -710,6 +736,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB10_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index 23db2479f66bb..c0227d28f832f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -20,6 +20,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB0_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -42,6 +43,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB0_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT @@ -70,6 +72,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrs ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB1_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -90,6 +93,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrs ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_xcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB1_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -123,6 +127,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB2_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -145,6 +150,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB2_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT @@ -177,6 +183,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB3_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc @@ -200,6 +207,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace( ; GFX12-NEXT: s_mov_b32 s5, 4 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB3_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], s5 idxen offset:4 th:TH_LOAD_NT @@ -231,6 +239,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB4_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc @@ -253,6 +262,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB4_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT_RT @@ -287,6 +297,7 @@ define amdgpu_kernel void 
@struct_ptr_nonatomic_buffer_load_i32(ptr addrspace(8) ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB5_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo @@ -310,6 +321,7 @@ define amdgpu_kernel void @struct_ptr_nonatomic_buffer_load_i32(ptr addrspace(8) ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB5_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo @@ -340,6 +352,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB6_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc @@ -363,6 +376,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB6_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT @@ -396,6 +410,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB7_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -418,6 +433,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB7_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT @@ -451,6 +467,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-SDAG-TRUE16-NEXT: .p2align ; GFX11-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -475,6 +492,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-FAKE16-NEXT: .p2align ; GFX11-FAKE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -499,6 +517,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-GISEL-TRUE16-NEXT: .p2align ; GFX11-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -521,6 +540,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX11-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-GISEL-NEXT: .p2align ; GFX11-GISEL-NEXT: .LBB8_1: ; %bb1 ; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -546,6 +566,7 @@ define amdgpu_kernel void 
@struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-SDAG-TRUE16-NEXT: .p2align ; GFX12-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT @@ -571,6 +592,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT @@ -596,6 +618,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-GISEL-TRUE16-NEXT: .p2align ; GFX12-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT @@ -631,6 +654,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB9_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc @@ -653,6 +677,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB9_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX12-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT @@ -686,6 +711,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB10_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -710,6 +736,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB10_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 7d3b316915923..bc3192ed863c0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -902,6 +902,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_branch .LBB7_5 +; SI-NEXT: .p2align ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -968,6 +969,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_branch .LBB7_5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1034,6 +1036,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: s_mov_b32 s2, 0 ; 
GFX10-32-NEXT: s_branch .LBB7_5 +; GFX10-32-NEXT: .p2align ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -1097,6 +1100,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-64-NEXT: s_branch .LBB7_5 +; GFX10-64-NEXT: .p2align ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index c1a32aafbc71e..bf40e2dd3cf1c 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -90,7 +90,7 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -112,7 +112,7 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX6: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -206,7 +206,7 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: 
v_mov_b32_e32 v2, v1 @@ -229,7 +229,7 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX6: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 @@ -323,7 +323,7 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 @@ -344,7 +344,7 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX6: .LBB2_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 @@ -437,7 +437,7 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 @@ -459,7 +459,7 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX6: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v2, 4.0, 
v1 @@ -493,7 +493,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: ds_load_b64 v[0:1], v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX12: .LBB4_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 @@ -528,7 +528,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: ds_load_b64 v[0:1], v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX11: .LBB4_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 @@ -553,7 +553,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: ds_read_b64 v[0:1], v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX10: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v1 @@ -586,7 +586,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: ds_read_b64 v[0:1], v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v1 @@ -609,7 +609,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: ds_read_b64 v[0:1], v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v1 @@ -632,7 +632,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: ds_read_b64 v[0:1], v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v1 @@ -655,7 +655,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX6: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v1 @@ -685,7 +685,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: ds_load_b64 v[0:1], v0 offset:65528 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 @@ -720,7 +720,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:65528 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 @@ -745,7 +745,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX10: 
.LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v1 @@ -778,7 +778,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v1 @@ -801,7 +801,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v1 @@ -824,7 +824,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v1 @@ -847,7 +847,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b64 v[0:1], v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX6: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v1 @@ -877,7 +877,7 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b64 v[1:2], v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; 
GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] @@ -910,7 +910,7 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b64 v[1:2], v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 @@ -933,7 +933,7 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b64 v[1:2], v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 @@ -965,7 +965,7 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b64 v[1:2], v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 @@ -987,7 +987,7 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b64 v[1:2], v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 @@ -1009,7 +1009,7 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; 
GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b64 v[1:2], v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 @@ -1031,7 +1031,7 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b64 v[1:2], v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 @@ -1060,7 +1060,7 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b64 v[1:2], v0 offset:65528 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] @@ -1093,7 +1093,7 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b64 v[1:2], v0 offset:65528 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 @@ -1116,7 +1116,7 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b64 v[1:2], v0 offset:65528 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
v_add_f64 v[3:4], v[1:2], 4.0 @@ -1148,7 +1148,7 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b64 v[1:2], v0 offset:65528 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 @@ -1170,7 +1170,7 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b64 v[1:2], v0 offset:65528 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 @@ -1192,7 +1192,7 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b64 v[1:2], v0 offset:65528 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 @@ -1215,7 +1215,7 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b64 v[0:1], v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], 4.0 @@ -1255,7 +1255,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: 
.LBB8_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB8_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1297,7 +1297,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB8_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 @@ -1336,7 +1336,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v3, v3 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 @@ -1366,7 +1366,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB8_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1402,7 +1402,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB8_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 @@ -1438,7 +1438,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -1470,7 +1470,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v3, v3 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -1500,7 +1500,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v3, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -1531,7 +1531,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1562,7 +1562,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -1596,7 +1596,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -1640,7 +1640,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB9_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1684,7 +1684,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB9_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1724,7 +1724,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v3, v0, s0 ; GFX942-NEXT: v_not_b32_e32 v3, v3 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 @@ -1756,7 +1756,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB9_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1794,7 +1794,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB9_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1831,7 +1831,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -1864,7 +1864,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v0, s4 ; GFX90A-NEXT: v_not_b32_e32 v3, v3 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -1895,7 +1895,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v0, s4 ; GFX908-NEXT: v_not_b32_e32 v3, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: 
.LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -1927,7 +1927,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1959,7 +1959,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -1994,7 +1994,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -2037,7 +2037,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -2078,7 +2078,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2116,7 +2116,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2145,7 +2145,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -2180,7 +2180,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2215,7 +2215,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind 
{ ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 @@ -2246,7 +2246,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2275,7 +2275,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2305,7 +2305,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2335,7 +2335,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: 
v_lshrrev_b32_e32 v4, v0, v3 @@ -2367,7 +2367,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2409,7 +2409,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2452,7 +2452,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2490,7 +2490,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v1, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2521,7 +2521,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2558,7 +2558,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2593,7 +2593,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2625,7 +2625,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2655,7 +2655,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB11_1: ; 
%atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2686,7 +2686,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2717,7 +2717,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2750,7 +2750,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX6: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2785,7 +2785,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB12_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2818,7 +2818,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr 
addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB12_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2849,7 +2849,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -2871,7 +2871,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB12_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2898,7 +2898,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB12_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2926,7 +2926,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 
offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -2952,7 +2952,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -2975,7 +2975,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -2998,7 +2998,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -3022,7 +3022,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -3049,7 +3049,7 @@ define half 
@local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -3083,7 +3083,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l @@ -3115,7 +3115,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1 @@ -3144,7 +3144,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v1 @@ -3165,7 +3165,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l @@ -3191,7 +3191,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1 @@ -3217,7 +3217,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f16_e32 v2, 4.0, v1 @@ -3242,7 +3242,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v1 @@ -3264,7 +3264,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_add_f16_e32 v2, 4.0, v1 @@ -3286,7 +3286,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f16_e32 v2, 4.0, v1 @@ -3309,7 +3309,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 @@ -3335,7 +3335,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 @@ -3378,7 +3378,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3431,7 +3431,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 
-; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3480,7 +3480,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -3609,7 +3609,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -3647,7 +3647,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -3683,7 +3683,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -3719,7 +3719,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; 
GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -3756,7 +3756,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -3790,7 +3790,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -3834,7 +3834,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB15_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3889,7 +3889,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB15_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3939,7 +3939,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -4073,7 +4073,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -4112,7 +4112,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -4149,7 +4149,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -4186,7 +4186,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -4224,7 +4224,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -4259,7 +4259,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -4302,7 +4302,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4354,7 +4354,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4402,7 +4402,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4528,7 +4528,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4565,7 +4565,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4600,7 +4600,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4635,7 +4635,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4671,7 +4671,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -4703,7 +4703,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -4745,7 +4745,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -4798,7 +4798,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 
v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -4846,7 +4846,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4975,7 +4975,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5013,7 +5013,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5049,7 +5049,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; 
GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5085,7 +5085,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5122,7 +5122,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -5155,7 +5155,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -5190,7 +5190,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB18_1: ; 
%atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -5234,7 +5234,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB18_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -5275,7 +5275,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX942: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -5380,7 +5380,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -5413,7 +5413,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -5444,7 +5444,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) 
%ptr) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -5474,7 +5474,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -5505,7 +5505,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -5532,7 +5532,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -5566,7 +5566,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v2, 16, v1 @@ -5608,7 +5608,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5647,7 +5647,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX942: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5747,7 +5747,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5779,7 +5779,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX90A: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5809,7 +5809,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: 
s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5838,7 +5838,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5868,7 +5868,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5894,7 +5894,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX6: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5948,7 +5948,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 @@ -5973,7 +5973,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x 
half> ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 @@ -5996,7 +5996,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX90A: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -6017,7 +6017,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 @@ -6039,7 +6039,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 @@ -6071,7 +6071,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: 
v_cvt_f16_f32_e32 v2, v2 @@ -6113,7 +6113,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX6: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -6172,7 +6172,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 @@ -6197,7 +6197,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX10: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 @@ -6220,7 +6220,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -6241,7 +6241,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 
s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 @@ -6263,7 +6263,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 @@ -6295,7 +6295,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -6338,7 +6338,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX6: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -6396,7 +6396,7 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 @@ -6419,7 +6419,7 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> 
%va ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 @@ -6441,7 +6441,7 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 @@ -6461,7 +6461,7 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 @@ -6482,7 +6482,7 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -6513,7 +6513,7 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6554,7 +6554,7 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6611,7 +6611,7 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 @@ -6634,7 +6634,7 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 @@ -6656,7 +6656,7 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 @@ -6676,7 +6676,7 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 @@ -6697,7 +6697,7 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -6728,7 +6728,7 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6770,7 +6770,7 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6930,7 +6930,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -6971,7 +6971,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -7010,7 +7010,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -7048,7 +7048,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -7095,7 +7095,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7134,7 +7134,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; 
GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7287,7 +7287,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -7328,7 +7328,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -7367,7 +7367,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -7405,7 +7405,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -7452,7 +7452,7 @@ define 
<2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7492,7 +7492,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -7640,7 +7640,7 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7680,7 +7680,7 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7718,7 +7718,7 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB26_1: ; 
%atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7755,7 +7755,7 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7800,7 +7800,7 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -7837,7 +7837,7 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -7984,7 +7984,7 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8024,7 +8024,7 @@ define void 
@local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8062,7 +8062,7 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8099,7 +8099,7 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8144,7 +8144,7 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8182,7 +8182,7 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB27_1: ; 
%atomicrmw.start +; GFX6: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8270,7 +8270,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, v0, s3, vcc_lo ; GFX12-NEXT: ; implicit-def: $vgpr0 -; GFX12-NEXT: .LBB28_5: ; %ComputeLoop +; GFX12: .LBB28_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_ctz_i32_b32 s3, s1 @@ -8360,7 +8360,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX942-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX942-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX942-NEXT: ; implicit-def: $vgpr0 -; GFX942-NEXT: .LBB28_5: ; %ComputeLoop +; GFX942: .LBB28_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -8449,7 +8449,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, s3, vcc_lo ; GFX11-NEXT: ; implicit-def: $vgpr0 -; GFX11-NEXT: .LBB28_5: ; %ComputeLoop +; GFX11: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -8535,7 +8535,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: v_add_f32_e32 v0, s3, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s3, vcc_lo ; GFX10-NEXT: ; implicit-def: $vgpr0 -; GFX10-NEXT: .LBB28_5: ; %ComputeLoop +; GFX10: .LBB28_5: ; %ComputeLoop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 @@ -8618,7 +8618,7 @@ define 
amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop +; GFX90A: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -8701,7 +8701,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX908-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX908-NEXT: ; implicit-def: $vgpr0 -; GFX908-NEXT: .LBB28_5: ; %ComputeLoop +; GFX908: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -8785,7 +8785,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: .LBB28_5: ; %ComputeLoop +; GFX8: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -8844,7 +8844,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB28_2: ; %atomicrmw.start +; GFX7: .LBB28_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v1 @@ -8874,7 +8874,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2 +; GFX7: .LBB28_6: ; %atomicrmw.start2 ; GFX7-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 @@ -8895,7 +8895,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX7-NEXT: ; implicit-def: $vgpr0 -; GFX7-NEXT: .LBB28_8: ; %ComputeLoop +; GFX7: .LBB28_8: ; %ComputeLoop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -8921,7 +8921,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v3 ; GFX7-NEXT: s_mov_b64 s[2:3], 0 -; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8 +; GFX7: .LBB28_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v2 @@ -8968,7 +8968,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB28_2: ; %atomicrmw.start +; GFX6: .LBB28_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v1 @@ -8998,7 +8998,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2 +; GFX6: .LBB28_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 @@ -9019,7 +9019,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; 
GFX6-NEXT: .LBB28_8: ; %ComputeLoop +; GFX6: .LBB28_8: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -9045,7 +9045,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v3 ; GFX6-NEXT: s_mov_b64 s[2:3], 0 -; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8 +; GFX6: .LBB28_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v2 @@ -9135,7 +9135,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, v0, s3, vcc_lo ; GFX12-NEXT: ; implicit-def: $vgpr0 -; GFX12-NEXT: .LBB29_5: ; %ComputeLoop +; GFX12: .LBB29_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_ctz_i32_b32 s3, s1 @@ -9221,7 +9221,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX942-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX942-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX942-NEXT: ; implicit-def: $vgpr0 -; GFX942-NEXT: .LBB29_5: ; %ComputeLoop +; GFX942: .LBB29_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -9305,7 +9305,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, s3, vcc_lo ; GFX11-NEXT: ; implicit-def: $vgpr0 -; GFX11-NEXT: .LBB29_5: ; %ComputeLoop +; GFX11: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -9384,7 +9384,7 @@ define amdgpu_kernel void 
@local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: v_add_f32_e32 v0, s3, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s3, vcc_lo ; GFX10-NEXT: ; implicit-def: $vgpr0 -; GFX10-NEXT: .LBB29_5: ; %ComputeLoop +; GFX10: .LBB29_5: ; %ComputeLoop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 @@ -9462,7 +9462,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop +; GFX90A: .LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -9542,7 +9542,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX908-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX908-NEXT: ; implicit-def: $vgpr0 -; GFX908-NEXT: .LBB29_5: ; %ComputeLoop +; GFX908: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -9623,7 +9623,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: .LBB29_5: ; %ComputeLoop +; GFX8: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -9680,7 +9680,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB29_2: ; %atomicrmw.start +; GFX7: .LBB29_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v1 @@ -9710,7 +9710,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 +; GFX7: .LBB29_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 @@ -9731,7 +9731,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX7-NEXT: ; implicit-def: $vgpr0 -; GFX7-NEXT: .LBB29_8: ; %ComputeLoop +; GFX7: .LBB29_8: ; %ComputeLoop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -9757,7 +9757,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v3 ; GFX7-NEXT: s_mov_b64 s[2:3], 0 -; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8 +; GFX7: .LBB29_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v2 @@ -9804,7 +9804,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB29_2: ; %atomicrmw.start +; GFX6: .LBB29_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v1 @@ -9834,7 +9834,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: 
.LBB29_6: ; %atomicrmw.start2 +; GFX6: .LBB29_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 @@ -9855,7 +9855,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: .LBB29_8: ; %ComputeLoop +; GFX6: .LBB29_8: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 @@ -9881,7 +9881,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v3 ; GFX6-NEXT: s_mov_b64 s[2:3], 0 -; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8 +; GFX6: .LBB29_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v2 @@ -9993,7 +9993,7 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -10015,7 +10015,7 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -10109,7 +10109,7 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b64 
s[4:5], 0 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 @@ -10130,7 +10130,7 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 739e86d1928b1..d3936dc9659e9 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -799,7 +799,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB8_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -843,7 +843,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB8_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -883,7 +883,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; 
GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -914,7 +914,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB8_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -952,7 +952,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB8_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -989,7 +989,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -1022,7 +1022,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: 
v_mov_b32_e32 v4, v3 @@ -1053,7 +1053,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1085,7 +1085,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1117,7 +1117,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -1151,7 +1151,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -1195,7 +1195,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB9_1: ; %atomicrmw.start ; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1241,7 +1241,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB9_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1282,7 +1282,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v0, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -1315,7 +1315,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB9_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1355,7 +1355,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB9_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v4, v3 @@ -1393,7 +1393,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1427,7 +1427,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -1459,7 +1459,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1492,7 +1492,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1525,7 +1525,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7: .LBB9_1: ; %atomicrmw.start ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -1560,7 +1560,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -1603,7 +1603,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1646,7 +1646,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1685,7 +1685,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 
@@ -1715,7 +1715,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1752,7 +1752,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1788,7 +1788,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 @@ -1820,7 +1820,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1850,7 +1850,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) 
nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1881,7 +1881,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1912,7 +1912,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1944,7 +1944,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1986,7 +1986,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2030,7 +2030,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2070,7 +2070,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v1, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2102,7 +2102,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2140,7 +2140,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v4, v1, v3 @@ -2177,7 +2177,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2210,7 +2210,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2241,7 +2241,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2273,7 +2273,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2305,7 +2305,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7: 
.LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2338,7 +2338,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX6: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2373,7 +2373,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB12_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2408,7 +2408,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB12_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2440,7 +2440,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -2463,7 +2463,7 @@ define half 
@local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB12_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2492,7 +2492,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB12_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2521,7 +2521,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -2548,7 +2548,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -2572,7 +2572,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX908-NEXT: s_mov_b64 
s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -2596,7 +2596,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -2621,7 +2621,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -2648,7 +2648,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -2682,7 +2682,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l @@ -2715,7 +2715,7 @@ define void 
@local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 @@ -2746,7 +2746,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2768,7 +2768,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l @@ -2795,7 +2795,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2823,7 +2823,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2849,7 +2849,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2872,7 +2872,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2895,7 +2895,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2919,7 +2919,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v2, v1 @@ -2945,7 +2945,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 @@ -2988,7 +2988,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3041,7 +3041,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3090,7 +3090,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -3219,7 +3219,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v2, 
v2 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -3257,7 +3257,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -3293,7 +3293,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -3329,7 +3329,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -3366,7 +3366,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -3401,7 +3401,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: 
v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -3446,7 +3446,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB15_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3501,7 +3501,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB15_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3551,7 +3551,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -3685,7 +3685,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -3724,7 +3724,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -3761,7 +3761,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -3798,7 +3798,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -3836,7 +3836,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -3872,7 +3872,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; 
GFX6: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -3916,7 +3916,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3968,7 +3968,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4016,7 +4016,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4142,7 +4142,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10: .LBB16_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4179,7 +4179,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4214,7 +4214,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4249,7 +4249,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4285,7 +4285,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -4318,7 +4318,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -4361,7 +4361,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -4414,7 +4414,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -4462,7 +4462,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4591,7 +4591,7 @@ define 
void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4629,7 +4629,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4665,7 +4665,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4701,7 +4701,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4738,7 +4738,7 @@ define void 
@local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -4772,7 +4772,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -4808,7 +4808,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB18_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -4852,7 +4852,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB18_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -4893,7 +4893,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start +; 
GFX942: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -4998,7 +4998,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -5031,7 +5031,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -5062,7 +5062,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -5092,7 +5092,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -5123,7 +5123,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: 
ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -5151,7 +5151,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -5186,7 +5186,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5228,7 +5228,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5267,7 +5267,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX942: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5367,7 +5367,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5399,7 +5399,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX90A: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5429,7 +5429,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5458,7 +5458,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5488,7 +5488,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB19_1: 
; %atomicrmw.start +; GFX7: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5515,7 +5515,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX6: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5554,7 +5554,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ds_load_b32 v2, v0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 @@ -5583,7 +5583,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX942: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v2 @@ -5607,7 +5607,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 @@ -5634,7 +5634,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; 
GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 @@ -5659,7 +5659,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX90A: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -5682,7 +5682,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 @@ -5707,7 +5707,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -5741,7 +5741,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -5783,7 +5783,7 @@ define 
<2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX6: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -5826,7 +5826,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 @@ -5855,7 +5855,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX942: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v2 @@ -5879,7 +5879,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 @@ -5906,7 +5906,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX10: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 @@ -5931,7 +5931,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -5954,7 +5954,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 @@ -5979,7 +5979,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -6013,7 +6013,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -6056,7 +6056,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; 
GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX6: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -6098,7 +6098,7 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: ds_load_b32 v2, v0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 @@ -6126,7 +6126,7 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX942: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6149,7 +6149,7 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6175,7 +6175,7 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6199,7 +6199,7 @@ define void 
@local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6221,7 +6221,7 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6245,7 +6245,7 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -6278,7 +6278,7 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6319,7 +6319,7 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 
v5 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6360,7 +6360,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 @@ -6388,7 +6388,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX942: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6411,7 +6411,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6437,7 +6437,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6461,7 +6461,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr 
addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6483,7 +6483,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6507,7 +6507,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -6540,7 +6540,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6582,7 +6582,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: 
.LBB23_1: ; %atomicrmw.start +; GFX6: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6629,7 +6629,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB24_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 @@ -6683,7 +6683,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB24_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 @@ -6733,7 +6733,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX942: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 @@ -6867,7 +6867,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
v_mov_b32_e32 v4, v2 @@ -6908,7 +6908,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -6947,7 +6947,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -6985,7 +6985,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -7032,7 +7032,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7071,7 +7071,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; 
GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7111,7 +7111,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB25_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 @@ -7165,7 +7165,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB25_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 @@ -7215,7 +7215,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX942: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 @@ -7349,7 +7349,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -7390,7 +7390,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -7429,7 +7429,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -7467,7 +7467,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -7514,7 +7514,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7554,7 +7554,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 
0xffff0000, v3 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -7594,7 +7594,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -7645,7 +7645,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -7693,7 +7693,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7821,7 +7821,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7861,7 +7861,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7899,7 +7899,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7936,7 +7936,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7981,7 +7981,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -8018,7 +8018,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -8057,7 +8057,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -8108,7 +8108,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -8156,7 +8156,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8284,7 +8284,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8324,7 +8324,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8362,7 +8362,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8399,7 +8399,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8444,7 +8444,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -8482,7 +8482,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) 
%ptr, <2 x b ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 6da80262951e5..627286eda640c 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -799,7 +799,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB8_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -843,7 +843,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB8_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -883,7 +883,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -914,7 +914,7 @@ define half 
@local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB8_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -952,7 +952,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB8_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -989,7 +989,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -1022,7 +1022,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -1053,7 +1053,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; 
GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1085,7 +1085,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1117,7 +1117,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -1151,7 +1151,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -1195,7 +1195,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB9_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1241,7 +1241,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 
0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB9_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1282,7 +1282,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v0, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -1315,7 +1315,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB9_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1355,7 +1355,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB9_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1393,7 +1393,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB9_1: ; 
%atomicrmw.start +; GFX10: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -1427,7 +1427,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -1459,7 +1459,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -1492,7 +1492,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -1525,7 +1525,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -1560,7 +1560,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: 
v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -1603,7 +1603,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1646,7 +1646,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1685,7 +1685,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1715,7 +1715,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-TRUE16-NEXT: .LBB10_1: ; 
%atomicrmw.start +; GFX11-TRUE16: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1752,7 +1752,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1788,7 +1788,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 @@ -1820,7 +1820,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1850,7 +1850,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1881,7 +1881,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1912,7 +1912,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1944,7 +1944,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -1986,7 +1986,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2030,7 +2030,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2070,7 +2070,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v1, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2102,7 +2102,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2140,7 +2140,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2177,7 +2177,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB11_1: ; 
%atomicrmw.start +; GFX10: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2210,7 +2210,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2241,7 +2241,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2273,7 +2273,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2305,7 +2305,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2338,7 +2338,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; 
GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX6: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2373,7 +2373,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB12_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2408,7 +2408,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB12_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2440,7 +2440,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -2463,7 +2463,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-TRUE16: 
.LBB12_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2492,7 +2492,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB12_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -2521,7 +2521,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -2548,7 +2548,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -2572,7 +2572,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -2596,7 +2596,7 @@ define half 
@local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -2621,7 +2621,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -2648,7 +2648,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -2682,7 +2682,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l @@ -2715,7 +2715,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB13_1: ; %atomicrmw.start ; 
GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 @@ -2746,7 +2746,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2768,7 +2768,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l @@ -2795,7 +2795,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2823,7 +2823,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
v_max_f16_e32 v2, v1, v1 @@ -2849,7 +2849,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2872,7 +2872,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2895,7 +2895,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 @@ -2919,7 +2919,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 @@ -2945,7 +2945,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6: .LBB13_1: ; %atomicrmw.start 
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 @@ -2988,7 +2988,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3041,7 +3041,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3090,7 +3090,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -3219,7 +3219,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -3257,7 +3257,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind 
{ ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -3293,7 +3293,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -3329,7 +3329,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -3366,7 +3366,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -3401,7 +3401,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -3446,7 +3446,7 @@ define bfloat 
@local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB15_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3501,7 +3501,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB15_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3551,7 +3551,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -3685,7 +3685,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -3724,7 +3724,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; 
GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -3761,7 +3761,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -3798,7 +3798,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -3836,7 +3836,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -3872,7 +3872,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -3916,7 +3916,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3968,7 +3968,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4016,7 +4016,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4142,7 +4142,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4179,7 +4179,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; 
GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4214,7 +4214,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4249,7 +4249,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4285,7 +4285,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -4318,7 +4318,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: 
.LBB16_1: ; %atomicrmw.start +; GFX6: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -4361,7 +4361,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -4414,7 +4414,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -4462,7 +4462,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4591,7 +4591,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4629,7 +4629,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4665,7 +4665,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4701,7 +4701,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4738,7 +4738,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -4772,7 +4772,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -4808,7 +4808,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB18_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -4852,7 +4852,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB18_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -4893,7 +4893,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX942: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -4998,7 +4998,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -5031,7 +5031,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -5062,7 +5062,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -5092,7 +5092,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -5123,7 +5123,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -5151,7 +5151,7 @@ define bfloat 
@local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -5186,7 +5186,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5228,7 +5228,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5267,7 +5267,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX942: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5367,7 +5367,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB19_1: ; 
%atomicrmw.start +; GFX10: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5399,7 +5399,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX90A: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5429,7 +5429,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5458,7 +5458,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5488,7 +5488,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5515,7 +5515,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: 
s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX6: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -5554,7 +5554,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ds_load_b32 v2, v0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 @@ -5583,7 +5583,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX942: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v2 @@ -5607,7 +5607,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 @@ -5634,7 +5634,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 @@ -5659,7 +5659,7 @@ define <2 x half> 
@local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX90A: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -5682,7 +5682,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 @@ -5707,7 +5707,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -5741,7 +5741,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -5783,7 +5783,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX6: .LBB20_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -5826,7 +5826,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 @@ -5855,7 +5855,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX942: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v2 @@ -5879,7 +5879,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 @@ -5906,7 +5906,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX10: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 @@ -5931,7 +5931,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 
; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -5954,7 +5954,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 @@ -5979,7 +5979,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -6013,7 +6013,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -6056,7 +6056,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX6: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 
v0, v0 @@ -6098,7 +6098,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: ds_load_b32 v2, v0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 @@ -6126,7 +6126,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX942: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6149,7 +6149,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6175,7 +6175,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6199,7 +6199,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6221,7 +6221,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6245,7 +6245,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -6278,7 +6278,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6319,7 +6319,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6360,7 +6360,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr 
addrspace(3) %ptr, <2 x h ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 @@ -6388,7 +6388,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX942: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6411,7 +6411,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6437,7 +6437,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6461,7 +6461,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6483,7 +6483,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 @@ -6507,7 +6507,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -6540,7 +6540,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6582,7 +6582,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6629,7 +6629,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) 
%ptr, <2 x bf ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB24_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 @@ -6683,7 +6683,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB24_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 @@ -6733,7 +6733,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX942: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 @@ -6867,7 +6867,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -6908,7 +6908,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start 
+; GFX90A: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -6947,7 +6947,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -6985,7 +6985,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -7032,7 +7032,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7071,7 +7071,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7111,7 +7111,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr 
addrspace(3) %ptr, ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB25_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 @@ -7165,7 +7165,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB25_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 @@ -7215,7 +7215,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX942: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 @@ -7349,7 +7349,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -7390,7 +7390,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB25_1: ; 
%atomicrmw.start +; GFX90A: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -7429,7 +7429,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -7467,7 +7467,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -7514,7 +7514,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7554,7 +7554,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -7594,7 +7594,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr 
addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -7645,7 +7645,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -7693,7 +7693,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7821,7 +7821,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7861,7 +7861,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 
0x7060302 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7899,7 +7899,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7936,7 +7936,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7981,7 +7981,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -8018,7 +8018,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -8057,7 
+8057,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -8108,7 +8108,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -8156,7 +8156,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8284,7 +8284,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8324,7 +8324,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; 
GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8362,7 +8362,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8399,7 +8399,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8444,7 +8444,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -8482,7 +8482,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 786989cc9fb57..b3b990918a18e 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -25,7 +25,7 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v1, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX12: .LBB0_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 @@ -52,7 +52,7 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v1, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -73,7 +73,7 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v1, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, v1 @@ -98,7 +98,7 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -121,7 +121,7 @@ define float 
@local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v1, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -142,7 +142,7 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v1, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX908: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -164,7 +164,7 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -186,7 +186,7 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -208,7 +208,7 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX6: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -237,7 +237,7 @@ define 
float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v1, v0 offset:65532 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX12: .LBB1_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 @@ -264,7 +264,7 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -285,7 +285,7 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v1, v0 offset:65532 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX11: .LBB1_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, v1 @@ -310,7 +310,7 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX10: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -333,7 +333,7 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX90A: .LBB1_1: ; 
%atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -354,7 +354,7 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -376,7 +376,7 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -398,7 +398,7 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -421,7 +421,7 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX6: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 @@ -450,7 +450,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v1, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB2_1: ; 
%atomicrmw.start +; GFX12: .LBB2_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -475,7 +475,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v1, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX942: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -495,7 +495,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v1, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX11: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -518,7 +518,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX10: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -540,7 +540,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v1, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX90A: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -560,7 +560,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v1, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX908: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -581,7 +581,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -602,7 +602,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -623,7 +623,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX6: .LBB2_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -651,7 +651,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v1, v0 offset:65532 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX12: .LBB3_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -676,7 +676,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind 
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -696,7 +696,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v1, v0 offset:65532 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX11: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -719,7 +719,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX10: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -741,7 +741,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -761,7 +761,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX908: .LBB3_1: ; %atomicrmw.start ; 
GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -782,7 +782,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -803,7 +803,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -825,7 +825,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX6: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -859,7 +859,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: ds_load_b64 v[0:1], v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX12: .LBB4_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 @@ -886,7 +886,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: ds_read_b64 v[0:1], v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB4_1: ; 
%atomicrmw.start +; GFX942: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] @@ -907,7 +907,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: ds_load_b64 v[0:1], v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX11: .LBB4_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 @@ -932,7 +932,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: ds_read_b64 v[0:1], v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX10: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v1 @@ -956,7 +956,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: ds_read_b64 v[0:1], v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] @@ -977,7 +977,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: ds_read_b64 v[0:1], v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v1 @@ -1000,7 +1000,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; 
GFX8-NEXT: ds_read_b64 v[0:1], v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v1 @@ -1023,7 +1023,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: ds_read_b64 v[0:1], v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v1 @@ -1046,7 +1046,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX6: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v1 @@ -1076,7 +1076,7 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: ds_load_b64 v[0:1], v0 offset:65528 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 @@ -1103,7 +1103,7 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] @@ -1124,7 +1124,7 @@ define double 
@local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:65528 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 @@ -1149,7 +1149,7 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX10: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v1 @@ -1173,7 +1173,7 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] @@ -1194,7 +1194,7 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v1 @@ -1217,7 +1217,7 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8: .LBB5_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v1 @@ -1240,7 +1240,7 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v1 @@ -1263,7 +1263,7 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b64 v[0:1], v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX6: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v1 @@ -1293,7 +1293,7 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b64 v[1:2], v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2] @@ -1318,7 +1318,7 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b64 v[2:3], v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 @@ -1338,7 +1338,7 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b64 v[1:2], v0 ; GFX11-NEXT: 
s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1361,7 +1361,7 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b64 v[1:2], v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1384,7 +1384,7 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[2:3], v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 @@ -1404,7 +1404,7 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b64 v[1:2], v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1426,7 +1426,7 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b64 v[1:2], v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1448,7 +1448,7 @@ define void 
@local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b64 v[1:2], v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1470,7 +1470,7 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b64 v[1:2], v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1499,7 +1499,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b64 v[1:2], v0 offset:65528 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2] @@ -1524,7 +1524,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b64 v[2:3], v0 offset:65528 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 @@ -1544,7 +1544,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b64 v[1:2], v0 offset:65528 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1567,7 +1567,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b64 v[1:2], v0 offset:65528 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1590,7 +1590,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[2:3], v0 offset:65528 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 @@ -1610,7 +1610,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b64 v[1:2], v0 offset:65528 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1632,7 +1632,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b64 v[1:2], v0 offset:65528 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1654,7 +1654,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind 
; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b64 v[1:2], v0 offset:65528 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 @@ -1677,7 +1677,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b64 v[0:1], v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], -4.0 @@ -1717,7 +1717,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB8_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1759,7 +1759,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB8_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 @@ -1798,7 +1798,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v3, v3 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 @@ -1828,7 +1828,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB8_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -1864,7 +1864,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB8_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 @@ -1900,7 +1900,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -1932,7 +1932,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v3, v3 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -1962,7 +1962,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v3, s4 ; 
GFX908-NEXT: v_not_b32_e32 v3, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -1993,7 +1993,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -2024,7 +2024,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -2058,7 +2058,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -2102,7 +2102,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB9_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -2146,7 +2146,7 @@ define half 
@local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB9_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -2186,7 +2186,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v3, v0, s0 ; GFX942-NEXT: v_not_b32_e32 v3, v3 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 @@ -2218,7 +2218,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB9_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -2256,7 +2256,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB9_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -2293,7 +2293,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; 
GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -2326,7 +2326,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v0, s4 ; GFX90A-NEXT: v_not_b32_e32 v3, v3 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -2357,7 +2357,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v0, s4 ; GFX908-NEXT: v_not_b32_e32 v3, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -2389,7 +2389,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -2421,7 +2421,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -2456,7 +2456,7 @@ define half 
@local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -2499,7 +2499,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -2540,7 +2540,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2578,7 +2578,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2607,7 +2607,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -2642,7 +2642,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2677,7 +2677,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 @@ -2708,7 +2708,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2737,7 +2737,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: 
.LBB10_1: ; %atomicrmw.start +; GFX908: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2767,7 +2767,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2797,7 +2797,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2829,7 +2829,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -2871,7 +2871,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2914,7 +2914,7 @@ define void 
@local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2952,7 +2952,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: v_lshlrev_b32_e64 v2, v1, s0 ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -2983,7 +2983,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -3020,7 +3020,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -3055,7 +3055,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; 
GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -3087,7 +3087,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -3117,7 +3117,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -3148,7 +3148,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -3179,7 +3179,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -3212,7 +3212,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX6: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -3247,7 +3247,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB12_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -3280,7 +3280,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB12_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -3311,7 +3311,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -3333,7 +3333,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB12_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -3360,7 +3360,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB12_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -3388,7 +3388,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -3414,7 +3414,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -3437,7 +3437,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908: .LBB12_1: ; %atomicrmw.start ; 
GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -3460,7 +3460,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -3484,7 +3484,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -3511,7 +3511,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -3545,7 +3545,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l @@ -3577,7 +3577,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 
offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1 @@ -3606,7 +3606,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v1 @@ -3627,7 +3627,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l @@ -3653,7 +3653,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1 @@ -3679,7 +3679,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; 
%atomicrmw.start +; GFX10: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f16_e32 v2, -4.0, v1 @@ -3704,7 +3704,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v1 @@ -3726,7 +3726,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_add_f16_e32 v2, -4.0, v1 @@ -3748,7 +3748,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f16_e32 v2, -4.0, v1 @@ -3771,7 +3771,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 @@ -3797,7 +3797,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: 
s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 @@ -3840,7 +3840,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3893,7 +3893,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -3942,7 +3942,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -4071,7 +4071,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -4109,7 +4109,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -4145,7 +4145,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -4181,7 +4181,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -4218,7 +4218,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -4252,7 +4252,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6: .LBB14_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -4296,7 +4296,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB15_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 @@ -4351,7 +4351,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB15_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 @@ -4401,7 +4401,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v3 @@ -4535,7 +4535,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -4574,7 +4574,7 @@ define bfloat 
@local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 @@ -4611,7 +4611,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -4648,7 +4648,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 @@ -4686,7 +4686,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 @@ -4721,7 +4721,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) 
; GFX6-NEXT: v_mov_b32_e32 v4, v3 @@ -4764,7 +4764,7 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4816,7 +4816,7 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4864,7 +4864,7 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4990,7 +4990,7 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v3, v3 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, 
v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5027,7 +5027,7 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5062,7 +5062,7 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5097,7 +5097,7 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5133,7 +5133,7 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -5165,7 +5165,7 @@ define void 
@local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 @@ -5207,7 +5207,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -5260,7 +5260,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -5308,7 +5308,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff -; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5437,7 +5437,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_lshlrev_b32_e32 
v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5475,7 +5475,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5511,7 +5511,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5547,7 +5547,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -5584,7 +5584,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; 
GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -5617,7 +5617,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v1 ; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 @@ -5652,7 +5652,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB18_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 @@ -5696,7 +5696,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB18_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 @@ -5737,7 +5737,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX942: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -5842,7 +5842,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -5875,7 +5875,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -5906,7 +5906,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -5936,7 +5936,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -5967,7 +5967,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; 
GFX7: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -5994,7 +5994,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -6028,7 +6028,7 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -6070,7 +6070,7 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -6109,7 +6109,7 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX942: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -6209,7 +6209,7 @@ define void 
@local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -6241,7 +6241,7 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX90A: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -6271,7 +6271,7 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -6300,7 +6300,7 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -6330,7 +6330,7 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 offset:65534 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -6356,7 +6356,7 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX6: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -6393,7 +6393,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 @@ -6420,7 +6420,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX942: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v2 @@ -6441,7 +6441,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 @@ -6466,7 +6466,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; 
GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 @@ -6489,7 +6489,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX90A: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -6510,7 +6510,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 @@ -6532,7 +6532,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 @@ -6564,7 +6564,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -6606,7 +6606,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; 
GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX6: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -6648,7 +6648,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 @@ -6675,7 +6675,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX942: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v2 @@ -6696,7 +6696,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 @@ -6721,7 +6721,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX10: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 @@ -6744,7 +6744,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -6765,7 +6765,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 @@ -6787,7 +6787,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 @@ -6819,7 +6819,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -6862,7 +6862,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, 
v5 -; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX6: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -6903,7 +6903,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -6928,7 +6928,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX942: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -6948,7 +6948,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -6971,7 +6971,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] 
neg_hi:[0,1] @@ -6993,7 +6993,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -7013,7 +7013,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -7034,7 +7034,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -7065,7 +7065,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -7106,7 +7106,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -7146,7 +7146,7 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -7171,7 +7171,7 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX942: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -7191,7 +7191,7 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -7214,7 +7214,7 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -7236,7 +7236,7 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -7256,7 +7256,7 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] @@ -7277,7 +7277,7 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -7308,7 +7308,7 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -7350,7 +7350,7 @@ 
define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -7397,7 +7397,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB24_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 @@ -7451,7 +7451,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB24_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 @@ -7501,7 +7501,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX942: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 @@ -7635,7 +7635,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: 
s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -7676,7 +7676,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -7715,7 +7715,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -7753,7 +7753,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -7800,7 +7800,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7839,7 +7839,7 @@ define <2 x bfloat> 
@local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7879,7 +7879,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB25_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 @@ -7933,7 +7933,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB25_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 @@ -7983,7 +7983,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX942: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 @@ -8117,7 +8117,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; 
GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 @@ -8158,7 +8158,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 @@ -8197,7 +8197,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 @@ -8235,7 +8235,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 @@ -8282,7 +8282,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -8322,7 +8322,7 @@ define <2 
x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -8362,7 +8362,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -8413,7 +8413,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -8461,7 +8461,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8589,7 +8589,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8629,7 +8629,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8667,7 +8667,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8704,7 +8704,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -8749,7 +8749,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; 
GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8786,7 +8786,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8825,7 +8825,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -8876,7 +8876,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -8924,7 +8924,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: s_movk_i32 s4, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 -; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -9052,7 +9052,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, 
<2 x b ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -9092,7 +9092,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -9130,7 +9130,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -9167,7 +9167,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -9212,7 +9212,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -9250,7 +9250,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -9292,7 +9292,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v1, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 @@ -9319,7 +9319,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v1, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX942: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, v1 @@ -9340,7 +9340,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v1, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, v1 @@ -9365,7 +9365,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 ; 
GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -9388,7 +9388,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v1, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 @@ -9409,7 +9409,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v1, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 @@ -9431,7 +9431,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 @@ -9453,7 +9453,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 @@ -9475,7 +9475,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; 
GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v1 @@ -9504,7 +9504,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v1, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -9529,7 +9529,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v1, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -9549,7 +9549,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v1, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -9572,7 +9572,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v1, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ 
-9594,7 +9594,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v1, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -9614,7 +9614,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v1, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -9635,7 +9635,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v1, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -9656,7 +9656,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 @@ -9677,7 +9677,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v1, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 5f0ca7bc42ae0..99f5ca968d6ee 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -28,6 +28,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_mov_b32 s4, 0 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: .p2align ; MUBUF-NEXT: .LBB0_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0x3000 @@ -65,6 +66,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: .p2align ; FLATSCR-NEXT: .LBB0_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLATSCR-NEXT: s_add_i32 s1, s0, 0x3000 @@ -121,6 +123,7 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_add_i32 s32, s32, 0x200000 ; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], s33 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: .p2align ; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 ; MUBUF-NEXT: v_lshrrev_b32_e64 v5, 6, s33 @@ -166,6 +169,7 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: scratch_store_dword off, v2, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: .p2align ; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLATSCR-NEXT: s_add_i32 s1, s33, s0 @@ -216,6 +220,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr 
addrspace(1) %out ; MUBUF-NEXT: s_mov_b32 s4, 0 ; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: .p2align ; MUBUF-NEXT: .LBB2_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 @@ -297,6 +302,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: .p2align ; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000 diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll index 4d751f2605c39..fbfa02a6907ed 100644 --- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll +++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll @@ -210,6 +210,7 @@ define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: long_backward_sbranch: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB4_1: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_add_i32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll index 0ce3742bb0e83..c0376d2279115 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll @@ -13,6 +13,7 @@ define <3 x float> @liveout_undef_subrange(<3 x float> %arg) { ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: ; kill: killed $vgpr1 ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_cmp_neq_f32_e32 vcc, 0, v2 diff --git 
a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll index 2445925b89bef..2c354b47a817d 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll @@ -23,6 +23,7 @@ define void @loop_on_argument(i1 %arg) { ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_and_b64 s[6:7], exec, vcc diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 702a69f776de3..0b16de5723bfb 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -14,6 +14,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB0_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -40,6 +41,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 +; GFX12-SPREFETCH-NEXT: .p2align ; GFX12-SPREFETCH-NEXT: .LBB0_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe @@ -68,6 +70,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB0_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: 
flat_load_b128 v[2:5], v0, s[2:3] offset:-176 @@ -114,6 +117,7 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB1_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 @@ -138,6 +142,7 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 +; GFX12-SPREFETCH-NEXT: .p2align ; GFX12-SPREFETCH-NEXT: .LBB1_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SPREFETCH-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 @@ -163,6 +168,7 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB1_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: global_load_b128 v[2:5], v0, s[2:3] offset:-176 @@ -207,6 +213,7 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr ; GFX12-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB2_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -232,6 +239,7 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr ; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SPREFETCH-NEXT: .p2align ; GFX12-SPREFETCH-NEXT: .LBB2_2: ; %for.body ; 
GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 @@ -258,6 +266,7 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr ; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB2_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -302,6 +311,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s2, 0 ; GFX12-NEXT: s_cbranch_scc1 .LBB3_2 +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB3_1: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -327,6 +337,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 ; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s2, 0 ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_2 +; GFX12-SPREFETCH-NEXT: .p2align ; GFX12-SPREFETCH-NEXT: .LBB3_1: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe @@ -352,6 +363,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_cmp_eq_u32 s2, 0 ; GFX1250-NEXT: s_cbranch_scc1 .LBB3_2 +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB3_1: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v4, s0 @@ -408,6 +420,7 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB4_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: 
flat_load_b128 v[4:7], v[2:3] offset:-176 @@ -446,6 +459,7 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff ; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1 ; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-SPREFETCH-NEXT: .p2align ; GFX12-SPREFETCH-NEXT: .LBB4_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 @@ -480,6 +494,7 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB4_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 @@ -539,6 +554,7 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-NEXT: .p2align ; GFX12-NEXT: .LBB5_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176 @@ -577,6 +593,7 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff ; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1 ; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-SPREFETCH-NEXT: .p2align ; GFX12-SPREFETCH-NEXT: .LBB5_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SPREFETCH-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176 @@ -611,6 +628,7 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 
s[4:5], v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] +; GFX1250-NEXT: .p2align ; GFX1250-NEXT: .LBB5_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176 diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll index 595a78ca0c08c..308b1c96d7585 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll @@ -1,15 +1,20 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -asm-verbose=0 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10-ASM %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s -filetype=obj | llvm-objdump -d --arch-name=amdgcn --mcpu=gfx1030 --symbolize-operands - | FileCheck --check-prefixes=GCN,GFX10,GFX10-DIS %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s + +; GFX9-LABEL: test_loop_64 +; GFX9: .p2align 5 ; GFX8-NOT: s_inst_prefetch ; GFX8-NOT: .palign 6 ; GCN-LABEL: test_loop_64 ; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 -; GFX10-DIS-NEXT: {{^$}} +; GFX10-ASM-NEXT: p2align 5 +; GFX10-DIS-NEXT: s_nop 0 ; GFX10-ASM-NEXT: [[L1:.LBB[0-9_]+]]: -; GFX10-DIS-NEXT: <[[L1:L[0-9]+]]>: +; GFX10-DIS: <[[L1:L[0-9]+]]>: ; GFX10: s_sleep 0 ; GFX10: s_cbranch_scc0 [[L1]] ; GFX10-NEXT: s_endpgm @@ -28,6 +33,9 @@ bb2: ; preds = %bb2, %bb br i1 %tmp3, label %bb1, label %bb2 } +; GFX9-LABEL: test_loop_128 +; GFX9: .p2align 4 + ; GCN-LABEL: test_loop_128 ; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 ; GFX10-ASM-NEXT: .p2align 6 @@ -68,6 +76,9 @@ bb2: ; preds = %bb2, %bb br i1 %tmp3, label %bb1, label %bb2 } +; GFX9-LABEL: test_loop_192 +; GFX9: .p2align 4 + ; GCN-LABEL: test_loop_192 ; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 ; GFX10-NEXT: s_inst_prefetch 0x1 @@ -128,11 +139,15 @@ bb2: ; preds = %bb2, %bb br i1 %tmp3, label %bb1, label %bb2 } +; GFX9-LABEL: 
test_loop_256 +; GFX9: .p2align 4 + ; GCN-LABEL: test_loop_256 ; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 -; GFX10-DIS-NEXT: {{^$}} +; GFX10-ASM-NEXT: p2align 4 +; GFX10-DIS-NEXT: s_nop 0 ; GFX10-ASM-NEXT: [[L1:.LBB[0-9_]+]]: -; GFX10-DIS-NEXT: <[[L1:L[0-9]+]]>: +; GFX10-DIS: <[[L1:L[0-9]+]]>: ; GFX10: s_sleep 0 ; GFX10: s_cbranch_scc0 [[L1]] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll index a33255a0acbb9..8565510f09a8f 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll @@ -13,6 +13,7 @@ define void @needs_and(i32 %arg) { ; GCN-NEXT: s_mov_b32 s10, 1 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_branch .LBB0_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_1: ; %endif ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] @@ -67,6 +68,7 @@ define void @doesnt_need_and(i32 %arg) { ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB1_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_add_i32 s6, s6, 1 @@ -104,6 +106,7 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) { ; GCN-NEXT: s_mov_b32 s10, 1 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_branch .LBB2_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB2_1: ; %endif ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir index 05cfe53224582..47c7520a69c7e 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir +++ b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir @@ -16,7 +16,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: S_BRANCH %bb.2 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: bb.1 (align 64): + ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.2(0x7c000000) ; GFX10-NEXT: {{ 
$}} ; GFX10-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index 34a9624cb19eb..852453de5bb32 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -72,6 +72,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: s_inst_prefetch 0x2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b32 vcc_lo, exec_lo +; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_9: ; %for.body159 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_9 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 680942fcb4d4b..bdfbd9ddcc3a3 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -122,6 +122,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0 +; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44 ; CHECK-NEXT: s_add_i32 s5, s5, 1 @@ -161,6 +162,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s69, 0 ; CHECK-NEXT: s_mov_b32 s80, 0 ; CHECK-NEXT: s_branch .LBB0_8 +; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 ; CHECK-NEXT: s_add_i32 s80, s80, 4 @@ -372,6 +374,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr 
addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.26: ; CHECK-NEXT: s_mov_b32 s52, 0 ; CHECK-NEXT: s_branch .LBB0_28 +; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -889,6 +892,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB1_3: ; %.53 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll b/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll index 4c2967a52fe93..61548da1c13bf 100644 --- a/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll @@ -26,6 +26,7 @@ define protected amdgpu_kernel void @_RSENC_PRInit______________________________ ; CHECK-NEXT: s_cbranch_vccnz .LBB0_13 ; CHECK-NEXT: ; %bb.2: ; %lor.lhs.false17 ; CHECK-NEXT: s_cmp_eq_u32 s4, 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_3: ; %while.cond.i ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_3 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index c92c672dda2ad..dddbc7e1efb4e 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -12,6 +12,7 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s16, v4 ; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v6, v5, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v7, s5 @@ -41,6 +42,7 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; 
CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 ; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: ; %bb.6: ; %loop-memcpy-residual ; CHECK-NEXT: s_add_u32 s6, 32, s4 ; CHECK-NEXT: s_addc_u32 s7, 0, s5 @@ -87,6 +89,7 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_13: ; %loop-memcpy-expansion2 ; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 @@ -117,6 +120,7 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_17: ; %loop-memcpy-residual4 ; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 @@ -173,6 +177,7 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual ; CHECK-NEXT: s_add_i32 s6, s8, 1 ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] @@ -188,6 +193,7 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_mov_b32_e32 v5, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB1_9: ; %loop-memcpy-expansion2 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccz .LBB1_9 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 4c0ab91b7d622..452036257adfb 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -14,6 +14,7 @@ define void @memcpy_p0_p0_sz2048(ptr 
addrspace(0) align 1 %dst, ptr addrspace(0) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %load-store-loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 @@ -90,7 +91,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB0_1: ; %load-store-loop +; ALIGNED: .LBB0_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo @@ -771,6 +772,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB1_1: ; %load-store-loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 @@ -846,7 +848,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB1_1: ; %load-store-loop +; ALIGNED: .LBB1_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo @@ -1524,6 +1526,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr 
addrspace(4) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB2_1: ; %load-store-loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 @@ -1592,6 +1595,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB2_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 @@ -2133,6 +2137,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB3_1: ; %load-store-loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e @@ -2389,7 +2394,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB3_1: ; %load-store-loop +; ALIGNED: .LBB3_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x34 ; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:255 @@ -3496,6 +3501,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: v_mov_b32_e32 v2, v1 ; UNROLL3-NEXT: v_mov_b32_e32 v3, v0 ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .p2align ; UNROLL3-NEXT: .LBB3_1: ; %load-store-loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: 
s_clause 0xb @@ -3580,6 +3586,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB4_1: ; %load-store-loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e @@ -3739,6 +3746,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x39 @@ -5419,6 +5427,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_cbranch_execz .LBB5_3 ; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB5_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 @@ -5485,6 +5494,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_movk_i32 s6, 0xff00 ; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 ; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB5_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 @@ -5566,6 +5576,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_cbranch_execz .LBB5_3 ; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB5_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 @@ -6195,6 +6206,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 ; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB5_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 @@ -6923,6 +6935,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; CHECK-NEXT: s_cbranch_execz .LBB6_3 ; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB6_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 @@ -6989,6 +7002,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; CHECK-NEXT: s_movk_i32 s6, 0xff00 ; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 ; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB6_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 @@ -7069,6 +7083,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_cbranch_execz .LBB6_3 ; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB6_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 @@ -7696,6 +7711,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 ; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB6_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 
v2, s4 @@ -8421,6 +8437,7 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: s_cbranch_execz .LBB7_3 ; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB7_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 @@ -8487,6 +8504,7 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: s_movk_i32 s6, 0xff00 ; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 ; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB7_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 @@ -8560,6 +8578,7 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: s_cbranch_execz .LBB7_3 ; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB7_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 @@ -9058,6 +9077,7 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 ; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB7_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 @@ -9647,6 +9667,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: s_cbranch_execz .LBB8_3 ; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; CHECK-NEXT: s_mov_b64 s[4:5], 0x800 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB8_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e @@ -9855,6 +9876,7 @@ define void @memmove_p5_p5_sz2048(ptr 
addrspace(5) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0 ; CHECK-NEXT: s_movk_i32 s4, 0xf800 ; CHECK-NEXT: s_mov_b32 s5, -1 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e @@ -10116,6 +10138,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_execz .LBB8_3 ; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB8_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255 @@ -11171,6 +11194,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0 ; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 ; ALIGNED-NEXT: s_mov_b32 s5, -1 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255 @@ -12283,6 +12307,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: v_mov_b32_e32 v2, v1 ; UNROLL3-NEXT: v_mov_b32_e32 v3, v0 ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0 +; UNROLL3-NEXT: .p2align ; UNROLL3-NEXT: .LBB8_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb @@ -12391,6 +12416,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2020 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: .p2align ; UNROLL3-NEXT: .LBB8_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb @@ -12455,6 +12481,7 @@ define void 
@memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v3 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB9_2 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB9_1: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e @@ -12559,6 +12586,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: s_movk_i32 s6, 0xff00 ; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 ; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB9_4: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e @@ -12724,6 +12752,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0 ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB9_2 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x39 @@ -14294,6 +14323,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 ; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: .p2align ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x3a diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index dd5c247f6ef35..b81de24086b4a 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -1054,6 +1054,7 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v2 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB7_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[10:13], v9 @@ -1081,6 +1082,7 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB7_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v7, v2 @@ -1298,6 +1300,7 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB9_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen @@ -1482,6 +1485,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB11_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 @@ -1509,6 +1513,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB11_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 @@ -1562,6 +1567,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_mov_b32_e32 v3, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB12_5: ; %memmove_fwd_main_loop ; 
CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[9:12], v3 @@ -1584,6 +1590,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB12_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v2, v1 @@ -1613,6 +1620,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v6, v0, v7 ; CHECK-NEXT: v_add_nc_u32_e32 v7, v1, v7 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB12_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v8, v7 @@ -1638,6 +1646,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5 ; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4 ; CHECK-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[5:8], v4 @@ -1676,6 +1685,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB13_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 @@ -1703,6 +1713,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB13_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 @@ -1774,6 +1785,7 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB14_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen @@ -1993,6 +2005,7 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB16_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 @@ -2062,6 +2075,7 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB17_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v2, v1 @@ -2131,6 +2145,7 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB18_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 @@ -2215,6 +2230,7 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB19_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen @@ -2244,6 +2260,7 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v6, v0, v7 ; CHECK-NEXT: v_add_nc_u32_e32 v7, v1, v7 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB19_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v8, v7, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 6110b3101020a..9bfd01aa00cf4 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -45,6 +45,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 @@ -137,6 +138,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -197,6 +199,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -279,6 +282,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX908-NEXT: .p2align ; 
GFX908-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 @@ -372,6 +376,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -433,6 +438,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -509,6 +515,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 @@ -601,6 +608,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -661,6 +669,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -805,6 +814,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX908-NEXT: s_nop 1 ; 
GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 @@ -929,6 +939,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1021,6 +1032,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1097,6 +1109,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 @@ -1190,6 +1203,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1251,6 +1265,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1366,6 
+1381,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 @@ -1461,6 +1477,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1524,6 +1541,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1636,6 +1654,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 @@ -1732,6 +1751,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1796,6 +1816,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; 
GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1879,6 +1900,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB7_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] @@ -1940,6 +1962,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB7_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] @@ -1969,6 +1992,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB7_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] @@ -2087,6 +2111,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a29, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 0 @@ -2183,6 +2208,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 +; 
GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -2247,6 +2273,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -2365,6 +2392,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB9_2 Depth 2 ; GFX908-NEXT: s_mov_b32 s1, 16 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB9_2: ; %inner.for.cond.preheader ; GFX908-NEXT: ; Parent Loop BB9_1 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -2467,6 +2495,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2 ; GFX90A-NEXT: s_mov_b32 s1, 16 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB9_2: ; %inner.for.cond.preheader ; GFX90A-NEXT: ; Parent Loop BB9_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 @@ -2537,6 +2566,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB9_2 Depth 2 ; GFX942-NEXT: s_mov_b32 s1, 16 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB9_2: ; %inner.for.cond.preheader ; GFX942-NEXT: ; Parent Loop BB9_1 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index 60f77bda6d50e..b4c84ea62fc74 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ 
-31,6 +31,7 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 ; GCN-NEXT: s_load_dword s4, s[4:5], 0xf ; GCN-NEXT: s_mov_b64 s[2:3], 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -90,6 +91,7 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:400 ; GCN-NEXT: s_load_dword s6, s[4:5], 0xf ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB1_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index f4e5c276b8b75..60c2095cf5c43 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -16,6 +16,7 @@ define void @lsr_order_mul24_0(i32 %arg, i32 %arg2, i32 %arg6, i32 %arg13, i32 % ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write_b32 v0, v5 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB0_1: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v0, v2 @@ -70,6 +71,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: s_mov_b64 s[10:11], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB1_2: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 1fad8f37cc28c..92e0094a86747 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -63,6 +63,7 @@ 
define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) { ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_branch .LBB0_4 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_3: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2 ; GCN-NEXT: s_or_b64 exec, exec, s[10:11] @@ -172,6 +173,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_branch .LBB1_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB1_1: ; %Flow4 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_and_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index 5ce30cbc8c015..e4e416d316362 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -19,6 +19,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) captu ; GCN-NEXT: ds_read_b64 v[0:1], v0 ; GCN-NEXT: s_and_b64 s[0:1], exec, 0 ; GCN-NEXT: s_branch .LBB0_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_1: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_mov_b64 vcc, s[0:1] @@ -39,6 +40,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) captu ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_read_b32 v0, v0 ; GCN-NEXT: s_and_b64 vcc, exec, 0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_6: ; %bb9 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_mov_b64 vcc, vcc @@ -170,6 +172,7 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) captures(none ; GCN-NEXT: s_mov_b64 s[0:1], -1 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB1_4: ; %bb18 ; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll 
b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll index 306703bd61806..5ad6aadff039c 100644 --- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll @@ -61,6 +61,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: ; implicit-def: $sgpr2 ; GFX12-NEXT: s_branch .LBB0_2 +; GFX12-NEXT: .p2align 4 ; GFX12-NEXT: .LBB0_1: ; %Flow ; GFX12-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index cf244f0b1f884..3121183607176 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -16,6 +16,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GFX942-NEXT: s_branch .LBB0_2 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX942-NEXT: s_or_b32 s4, s3, 1 @@ -52,6 +53,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GFX908-NEXT: s_branch .LBB0_2 +; GFX908-NEXT: .p2align ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX908-NEXT: s_or_b32 s4, s3, 1 diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll index 88cc06d8b3832..e015c04ecf499 100644 --- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll @@ -284,6 +284,7 @@ define amdgpu_kernel void @no_clobbering_loop1(ptr addrspace(1) %arg, i1 %cc) { ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; GCN-NEXT: .p2align ; GCN-NEXT: 
.LBB4_1: ; %while.cond ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_load_dword s5, s[2:3], 0x4 @@ -338,6 +339,7 @@ define amdgpu_kernel void @no_clobbering_loop2(ptr addrspace(1) noalias %arg, pt ; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB5_1: ; %while.cond ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_load_dword s5, s[0:1], 0x0 @@ -402,6 +404,7 @@ define amdgpu_kernel void @clobbering_loop(ptr addrspace(1) %arg, ptr addrspace( ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB6_1: ; %while.cond ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: global_load_dword v1, v0, s[8:9] offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index 0887f41b7db97..d06f401ec85e0 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -29,6 +29,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GCN-NEXT: s_mov_b32 s12, s6 ; GCN-NEXT: s_branch .LBB0_4 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_3: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2 ; GCN-NEXT: s_andn2_b64 vcc, exec, s[16:17] @@ -42,6 +43,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: ; %bb.5: ; in Loop: Header=BB0_4 Depth=2 ; GCN-NEXT: s_mov_b64 s[16:17], s[2:3] ; GCN-NEXT: s_branch .LBB0_7 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_6: ; %bb3 ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2 ; GCN-NEXT: s_add_i32 s12, s12, 1 @@ -105,6 +107,7 @@ define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s3, s6 ; GCN-NEXT: s_branch .LBB1_2 +; GCN-NEXT: 
.p2align ; GCN-NEXT: .LBB1_1: ; %bb7 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 @@ -125,6 +128,7 @@ define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) ; GCN-NEXT: s_mov_b64 vcc, exec ; GCN-NEXT: s_cbranch_execnz .LBB1_1 ; GCN-NEXT: s_branch .LBB1_5 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB1_4: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: ; implicit-def: $sgpr2 ; GCN-NEXT: s_mov_b64 vcc, 0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index b1e05158b6212..ca4e7243e4050 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -384,6 +384,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -513,6 +514,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: s_mov_b32 s6, 0 +; GFX900-NEXT: .p2align ; GFX900-NEXT: .LBB1_2: ; %for.body ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 @@ -625,6 +627,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB1_2: ; %for.body ; GFX10-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 @@ -740,6 +743,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: 
s_mov_b32 s4, 0 +; GFX90A-NEXT: .p2align ; GFX90A-NEXT: .LBB1_2: ; %for.body ; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 @@ -844,6 +848,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB1_2: ; %for.body ; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 @@ -2586,6 +2591,7 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8-NEXT: .p2align ; GFX8-NEXT: .LBB8_1: ; %branch ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_and_b64 s[2:3], exec, vcc @@ -2605,6 +2611,7 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB8_1: ; %branch ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_and_b64 s[2:3], exec, vcc @@ -2625,6 +2632,7 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB8_1: ; %branch ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_and_b32 s1, exec_lo, vcc_lo @@ -2644,6 +2652,7 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: .p2align ; GFX11-TRUE16-NEXT: .LBB8_1: ; %branch ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-TRUE16-NEXT: s_and_b32 s1, exec_lo, vcc_lo @@ -2664,6 +2673,7 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX11-FAKE16-NEXT: flat_load_u8 v0, v[0:1] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: .p2align ; GFX11-FAKE16-NEXT: .LBB8_1: ; %branch ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_and_b32 s1, exec_lo, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index f0c8fed925673..66e3070b510f8 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -149,6 +149,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v19, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 31, v15 @@ -1624,6 +1625,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v21, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll index cefcd7e0d2651..d224c9809cbc3 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll @@ -95,7 +95,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace ; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] ; CHECK-NEXT: v_mov_b32_e32 v64, 4.0 ; CHECK-NEXT: v_mov_b32_e32 v65, 2.0 -; CHECK-NEXT: .LBB1_1: ; %loop +; CHECK: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: 
s_waitcnt vmcnt(0) ; CHECK-NEXT: s_nop 7 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 4addf42b27984..02caa24f6342e 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -188,6 +188,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 @@ -412,6 +413,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 @@ -1262,6 +1264,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 @@ -1456,6 +1459,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 @@ -1651,6 +1655,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 
@@ -1749,6 +1754,7 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 19f0e93c308d8..4d2722c25f075 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -2015,6 +2015,7 @@ define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { ; NOSDWA: ; %bb.0: ; %bb ; NOSDWA-NEXT: s_mov_b32 s0, 0xffff ; NOSDWA-NEXT: s_and_b64 vcc, exec, -1 +; NOSDWA-NEXT: .p2align ; NOSDWA-NEXT: .LBB21_1: ; %bb1 ; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1 ; NOSDWA-NEXT: ;;#ASMSTART @@ -2032,6 +2033,7 @@ define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { ; GFX89: ; %bb.0: ; %bb ; GFX89-NEXT: s_mov_b32 s0, 0xffff ; GFX89-NEXT: s_and_b64 vcc, exec, -1 +; GFX89-NEXT: .p2align ; GFX89-NEXT: .LBB21_1: ; %bb1 ; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX89-NEXT: ;;#ASMSTART @@ -2049,6 +2051,7 @@ define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_and_b64 vcc, exec, -1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB21_1: ; %bb1 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: ;;#ASMSTART @@ -2066,6 +2069,7 @@ define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB21_1: ; %bb1 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: ;;#ASMSTART @@ -2104,6 +2108,7 @@ define void @crash_lshlrevb16_not_reg_op() { ; NOSDWA-NEXT: s_and_b32 s6, s4, 0x1ff ; NOSDWA-NEXT: s_mov_b64 s[4:5], 0 ; NOSDWA-NEXT: s_and_b64 vcc, exec, -1 +; 
NOSDWA-NEXT: .p2align ; NOSDWA-NEXT: .LBB22_1: ; %bb1 ; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1 ; NOSDWA-NEXT: s_lshl_b32 s7, s4, 3 @@ -2126,6 +2131,7 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX89-NEXT: s_and_b32 s6, s4, 0x1ff ; GFX89-NEXT: s_mov_b64 s[4:5], 0 ; GFX89-NEXT: s_and_b64 vcc, exec, -1 +; GFX89-NEXT: .p2align ; GFX89-NEXT: .LBB22_1: ; %bb1 ; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX89-NEXT: s_lshl_b32 s7, s4, 3 @@ -2148,6 +2154,7 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX9-NEXT: s_and_b32 s6, s4, 0x1ff ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_and_b64 vcc, exec, -1 +; GFX9-NEXT: .p2align ; GFX9-NEXT: .LBB22_1: ; %bb1 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_lshl_b32 s7, s4, 3 @@ -2170,6 +2177,7 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo ; GFX10-NEXT: s_and_b32 s6, s4, 0x1ff ; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: .p2align ; GFX10-NEXT: .LBB22_1: ; %bb1 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_lshl_b32 s7, s4, 3 diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index f497752994852..e24ff872c7823 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -63,6 +63,7 @@ define amdgpu_kernel void @undef_v6f32(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB4_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[6:9], v0 @@ -107,6 +108,7 @@ define amdgpu_kernel void @undef_v6i32(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB5_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GCN-NEXT: ds_read_b128 v[6:9], v0 @@ -152,6 +154,7 @@ define amdgpu_kernel void @undef_v5f32(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB6_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[5:8], v0 @@ -195,6 +198,7 @@ define amdgpu_kernel void @undef_v5i32(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB7_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[5:8], v0 @@ -240,6 +244,7 @@ define amdgpu_kernel void @undef_v3f64(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB8_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[7:10], v6 @@ -283,6 +288,7 @@ define amdgpu_kernel void @undef_v3i64(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_mov_b32_e32 v6, s4 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB9_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[7:10], v6 @@ -329,6 +335,7 @@ define amdgpu_kernel void @undef_v4f16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB10_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b64 v[3:4], v2 @@ -367,6 +374,7 @@ define amdgpu_kernel void @undef_v4i16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: 
.p2align ; GCN-NEXT: .LBB11_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b64 v[3:4], v2 @@ -406,6 +414,7 @@ define amdgpu_kernel void @undef_v2f16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB12_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b32 v2, v0 @@ -443,6 +452,7 @@ define amdgpu_kernel void @undef_v2i16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB13_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b32 v2, v0 @@ -503,6 +513,7 @@ define amdgpu_kernel void @undef_bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB15_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_u16 v1, v0 @@ -541,6 +552,7 @@ define amdgpu_kernel void @undef_v2bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB16_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b32 v1, v0 @@ -580,6 +592,7 @@ define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB17_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b32 v2, v0 @@ -622,6 +635,7 @@ define amdgpu_kernel void @undef_v4bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: 
v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB18_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b64 v[2:3], v0 @@ -662,6 +676,7 @@ define amdgpu_kernel void @undef_v6bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB19_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b64 v[3:4], v0 @@ -705,6 +720,7 @@ define amdgpu_kernel void @undef_v8bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB20_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[4:7], v0 @@ -747,6 +763,7 @@ define amdgpu_kernel void @undef_v16bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB21_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[8:11], v0 @@ -796,6 +813,7 @@ define amdgpu_kernel void @undef_v32bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB22_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[16:19], v0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll index 192bd2073886a..89b6324b6b740 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll @@ -25,6 +25,7 @@ define amdgpu_kernel void @copy_to_vreg_1(i32 %0) { ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: s_branch .LBB0_3 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_1: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll index 98c4868e213db..da7b73e41e8a0 100644 --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -9,6 +9,7 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i ; GCN-NEXT: s_mov_b32 s7, 0 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v1 ; GCN-NEXT: s_branch .LBB0_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_1: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index e8da10c32f5d4..204237276a1fc 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -12,6 +12,7 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB0_1: ; %ENDIF ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_and_b64 s[2:3], exec, vcc @@ -37,6 +38,7 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; FLAT-NEXT: v_and_b32_e32 v0, 1, v0 ; FLAT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; FLAT-NEXT: s_mov_b64 s[0:1], 0 +; FLAT-NEXT: .p2align ; FLAT-NEXT: .LBB0_1: ; %ENDIF ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_and_b64 s[2:3], exec, vcc @@ -83,6 +85,7 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec ; SI-NEXT: .LBB1_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: .p2align ; SI-NEXT: .LBB1_3: ; 
%loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_and_b64 s[4:5], exec, s[2:3] @@ -108,6 +111,7 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; FLAT-NEXT: s_and_b64 s[2:3], s[2:3], exec ; FLAT-NEXT: .LBB1_2: ; %endif ; FLAT-NEXT: s_or_b64 exec, exec, s[6:7] +; FLAT-NEXT: .p2align ; FLAT-NEXT: .LBB1_3: ; %loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_and_b64 s[4:5], exec, s[2:3] @@ -177,6 +181,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.2: ; %for.body ; SI-NEXT: s_and_b64 vcc, exec, 0 +; SI-NEXT: .p2align ; SI-NEXT: .LBB3_3: ; %self.loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_mov_b64 vcc, vcc @@ -197,6 +202,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; FLAT-NEXT: s_cbranch_scc0 .LBB3_4 ; FLAT-NEXT: ; %bb.2: ; %for.body ; FLAT-NEXT: s_and_b64 vcc, exec, 0 +; FLAT-NEXT: .p2align ; FLAT-NEXT: .LBB3_3: ; %self.loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_mov_b64 vcc, vcc diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll index 5d5e35f86890c..554764cde2dce 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll @@ -15,6 +15,7 @@ define amdgpu_kernel void @test(i32 %arg, i32 %arg1) { ; CHECK-NEXT: s_cbranch_vccnz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb9 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_2: ; %bb10 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_mov_b64 vcc, vcc diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..dfc926c2833ba 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ 
b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -33,6 +33,7 @@ define void @nested_inf_loop(i1 %0, i1 %1) { ; ISA-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; ISA-NEXT: s_or_b64 exec, exec, s[8:9] ; ISA-NEXT: s_mov_b64 s[8:9], 0 +; ISA-NEXT: .p2align ; ISA-NEXT: .LBB0_3: ; %BB4 ; ISA-NEXT: ; Parent Loop BB0_1 Depth=1 ; ISA-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll index a5299ea36958d..20b38393dafc4 100644 --- a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll +++ b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll @@ -63,6 +63,7 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly %arg, ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_4: ; %bb21 ; CHECK-NEXT: ; Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: ; Parent Loop BB0_3 Depth=2 @@ -72,6 +73,7 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly %arg, ; CHECK-NEXT: s_mov_b64 vcc, s[2:3] ; CHECK-NEXT: s_cbranch_vccz .LBB0_4 ; CHECK-NEXT: s_branch .LBB0_2 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_5: ; %bb31 ; CHECK-NEXT: ; Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index b21c781f6223a..eabf983215bf9 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -940,6 +940,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; SI-NEXT: ; %bb.1: ; %bb.preheader ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: .p2align ; SI-NEXT: .LBB10_2: ; %bb ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: ;;#ASMSTART @@ -985,6 +986,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) 
#0 { ; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB10_3 +; GFX10-WAVE64-NEXT: .p2align ; GFX10-WAVE64-NEXT: .LBB10_1: ; %bb ; GFX10-WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-WAVE64-NEXT: ;;#ASMSTART @@ -1028,6 +1030,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB10_3 +; GFX10-WAVE32-NEXT: .p2align ; GFX10-WAVE32-NEXT: .LBB10_1: ; %bb ; GFX10-WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-WAVE32-NEXT: ;;#ASMSTART @@ -1071,6 +1074,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11-NEXT: s_cbranch_execz .LBB10_3 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB10_1: ; %bb ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: ;;#ASMSTART @@ -1709,6 +1713,7 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_branch .LBB15_3 +; SI-NEXT: .p2align ; SI-NEXT: .LBB15_2: ; %latch ; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1754,6 +1759,7 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE64-NEXT: s_mov_b32 s6, 0 ; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-WAVE64-NEXT: s_branch .LBB15_3 +; GFX10-WAVE64-NEXT: .p2align ; GFX10-WAVE64-NEXT: .LBB15_2: ; %latch ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1799,6 +1805,7 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 ; GFX10-WAVE32-NEXT: s_mov_b32 s2, 0 ; GFX10-WAVE32-NEXT: s_branch .LBB15_3 +; GFX10-WAVE32-NEXT: .p2align ; 
GFX10-WAVE32-NEXT: .LBB15_2: ; %latch ; GFX10-WAVE32-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -1844,6 +1851,7 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_branch .LBB15_3 +; GFX11-NEXT: .p2align ; GFX11-NEXT: .LBB15_2: ; %latch ; GFX11-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index e64e3def98c26..981d33fbe104d 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -160,6 +160,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_addc_u32 s11, s5, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 @@ -388,6 +389,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 @@ -1102,6 +1104,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_addc_u32 s13, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 @@ -1381,6 +1384,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: 
.p2align ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 @@ -1573,6 +1577,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 @@ -1766,6 +1771,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 @@ -1870,6 +1876,7 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 9cb22dad86b88..76d00fec1b11c 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -357,6 +357,7 @@ define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocaptu ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-NEXT: s_branch .LBB10_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB10_1: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll index 42436a1b4c279..c3106aa0799bd 100644 --- a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll +++ b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll 
@@ -99,6 +99,7 @@ define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 { ; GFX900-NEXT: v_mov_b32_e32 v1, s0 ; GFX900-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v3 ; GFX900-NEXT: s_branch .LBB2_2 +; GFX900-NEXT: .p2align ; GFX900-NEXT: .LBB2_1: ; %latch ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX900-NEXT: s_or_b64 exec, exec, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index 4a5dc8f300af3..ad86999d0bbeb 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -27,6 +27,7 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) ; CHECK-NEXT: s_mov_b32 s3, 0x40260000 ; CHECK-NEXT: s_mov_b32 s5, 0x40280000 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index 0cf26be3ac24f..724456703bf6c 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -131,7 +131,7 @@ define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(floa ; GCN-NEXT: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.4.flow: + ; GCN-NEXT: bb.4.flow (align 32): ; GCN-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GCN-NEXT: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 469ea24634f62..8556c81a11742 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ 
-350,6 +350,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0 ; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) ; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2 +; HSA-TRAP-GFX1100-NEXT: .p2align ; HSA-TRAP-GFX1100-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1 ; HSA-TRAP-GFX1100-NEXT: s_sethalt 5 ; HSA-TRAP-GFX1100-NEXT: s_branch .LBB2_3 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index bc9a3f2389e7e..e40eea40e19ff 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -161,6 +161,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 @@ -354,6 +355,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 @@ -923,6 +925,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 @@ -1101,6 +1104,7 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 @@ -1188,6 +1192,7 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 @@ -1289,6 +1294,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_addc_u32 s3, 0, -1 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 @@ -1387,6 +1393,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 31708a9b738db..2a302c23c8afa 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -59,6 +59,7 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 { ; CHECK-NEXT: s_waitcnt expcnt(1) ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; CHECK-NEXT: .p2align ; CHECK-NEXT: .LBB1_1: ; %bb9 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index 5108159e7a847..0d4d728d9c805 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -563,6 +563,7 @@ define 
amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: uniform_loop: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-NEXT: .p2align ; SI-NEXT: .LBB10_1: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -575,6 +576,7 @@ define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { ; VI-LABEL: uniform_loop: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dword s0, s[4:5], 0x2c +; VI-NEXT: .p2align ; VI-NEXT: .LBB10_1: ; %loop ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1145,6 +1147,7 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) { ; SI-NEXT: s_and_b64 vcc, exec, 0 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_branch .LBB20_2 +; SI-NEXT: .p2align ; SI-NEXT: .LBB20_1: ; %bb3 ; SI-NEXT: ; in Loop: Header=BB20_2 Depth=1 ; SI-NEXT: v_add_i32_e64 v0, s[4:5], 8, v0 @@ -1172,6 +1175,7 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) { ; VI-NEXT: s_and_b64 vcc, exec, 0 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_branch .LBB20_2 +; VI-NEXT: .p2align ; VI-NEXT: .LBB20_1: ; %bb3 ; VI-NEXT: ; in Loop: Header=BB20_2 Depth=1 ; VI-NEXT: v_add_u32_e64 v0, s[4:5], 8, v0 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll index f001bf0d5e498..6684e12ef14fe 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -14,6 +14,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX90A-NEXT: s_mov_b32 s4, 0 ; GFX90A-NEXT: s_mov_b32 s5, 0 ; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: .p2align 4 ; GFX90A-NEXT: .LBB0_1: ; %for.body ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -62,6 +63,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX942-NEXT: s_mov_b32 s4, 0 ; GFX942-NEXT: s_mov_b32 s5, 0 ; GFX942-NEXT: s_mov_b32 s6, 0 +; GFX942-NEXT: .p2align 
4 ; GFX942-NEXT: .LBB0_1: ; %for.body ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index 25e8581fb6cdd..873e559af598d 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -313,6 +313,7 @@ define hidden void @blam() { ; GCN-NEXT: v_cmp_neq_f32_e64 s[50:51], 0, v43 ; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 ; GCN-NEXT: s_branch .LBB1_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB1_1: ; %Flow7 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 464dad83f47c9..cc79a9f6066bf 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -160,6 +160,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_addc_u32 s11, s5, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 @@ -363,6 +364,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 @@ -902,6 +904,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB6_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 
s[10:11], s[10:11], 1 @@ -1017,6 +1020,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 @@ -1202,6 +1206,7 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 @@ -1295,6 +1300,7 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: .p2align ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll index 490046ce5b856..f38a9b5872eb9 100644 --- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -11,6 +11,7 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align ; GFX11-TRUE16-NEXT: .LBB0_1: ; %loop ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 @@ -30,6 +31,7 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align ; GFX11-FAKE16-NEXT: .LBB0_1: ; %loop ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -55,6 +57,7 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .p2align ; GFX12-TRUE16-NEXT: .LBB0_1: ; %loop ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 @@ -79,6 +82,7 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .p2align ; GFX12-FAKE16-NEXT: .LBB0_1: ; %loop ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll index b46f5f5640b66..2057f11f078d7 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -89,6 +89,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: s_branch .LBB2_2 +; SI-NEXT: .p2align ; SI-NEXT: .LBB2_1: ; %if.end ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; SI-NEXT: s_or_b32 exec_lo, exec_lo, s2 diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index d8264b5a091e1..9e187b947d6a0 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -677,6 +677,7 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: .p2align ; GFX942-NEXT: .LBB12_1: ; %bb.1 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT:
s_and_b64 s[6:7], exec, vcc diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 097154ed23ede..4e7606e24e148 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -360,6 +360,7 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1032-NEXT: s_branch .LBB10_2 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB10_1: ; %bb13 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 @@ -422,6 +423,7 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1064-NEXT: s_branch .LBB10_2 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB10_1: ; %bb13 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 @@ -526,6 +528,7 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-NEXT: s_mov_b32 s3, 0 ; GFX1032-NEXT: ; implicit-def: $sgpr4 ; GFX1032-NEXT: s_branch .LBB11_4 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB11_2: ; %bb8 ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 ; GFX1032-NEXT: s_add_i32 s3, s3, 1 @@ -570,6 +573,7 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1064-NEXT: s_branch .LBB11_4 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB11_2: ; %bb8 ; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1 ; GFX1064-NEXT: s_add_i32 s6, s6, 1 @@ -1538,6 +1542,7 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-NEXT: v_subrev_nc_u32_e32 v0, s0, v0 ; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: s_branch .LBB27_2 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB27_1: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB27_2 Depth=1 ; GFX1032-NEXT: 
s_xor_b32 s3, s1, -1 @@ -1576,6 +1581,7 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1064-NEXT: v_subrev_nc_u32_e32 v0, s0, v0 ; GFX1064-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-NEXT: s_branch .LBB27_2 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB27_1: ; %Flow ; GFX1064-NEXT: ; in Loop: Header=BB27_2 Depth=1 ; GFX1064-NEXT: s_xor_b64 s[6:7], s[2:3], -1 @@ -1811,6 +1817,7 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 { ; GFX1032-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-NEXT: s_branch .LBB33_2 +; GFX1032-NEXT: .p2align ; GFX1032-NEXT: .LBB33_1: ; %body ; GFX1032-NEXT: ; in Loop: Header=BB33_2 Depth=1 ; GFX1032-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -1843,6 +1850,7 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 { ; GFX1064-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-NEXT: s_branch .LBB33_2 +; GFX1064-NEXT: .p2align ; GFX1064-NEXT: .LBB33_1: ; %body ; GFX1064-NEXT: ; in Loop: Header=BB33_2 Depth=1 ; GFX1064-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll index 19c8e842a1390..ffac3adc82a4b 100644 --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -7,6 +7,7 @@ define amdgpu_ps float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-NEXT: s_mov_b32 s1, -1 ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_branch .LBB0_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB0_1: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 @@ -82,6 +83,7 @@ define amdgpu_ps float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-NEXT: s_mov_b32 s1, -1 ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_branch .LBB1_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB1_1: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; 
GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 @@ -160,6 +162,7 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_branch .LBB2_2 +; GCN-NEXT: .p2align ; GCN-NEXT: .LBB2_1: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index ad8dcd3888e9f..84c791341f1b9 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -763,6 +763,7 @@ define amdgpu_ps float @test_wwm6_loop() { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: .p2align 4 ; GFX9-W64-NEXT: .LBB17_1: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -791,6 +792,7 @@ define amdgpu_ps float @test_wwm6_loop() { ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 +; GFX10-W32-NEXT: .p2align 5 ; GFX10-W32-NEXT: .LBB17_1: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 @@ -1229,6 +1231,7 @@ define amdgpu_ps float @test_strict_wqm6_loop() { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: .p2align 4 ; GFX9-W64-NEXT: .LBB25_1: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec @@ -1260,6 +1263,7 @@ define amdgpu_ps float @test_strict_wqm6_loop() { ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 +; GFX10-W32-NEXT: .p2align 5 ; GFX10-W32-NEXT: .LBB25_1: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo @@ 
-1936,6 +1940,7 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { ; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000 ; GFX9-W64-NEXT: s_branch .LBB35_2 +; GFX9-W64-NEXT: .p2align 4 ; GFX9-W64-NEXT: .LBB35_1: ; %body ; GFX9-W64-NEXT: ; in Loop: Header=BB35_2 Depth=1 ; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf @@ -2660,6 +2665,7 @@ define amdgpu_ps float @test_strict_wwm6_loop() { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: .p2align 4 ; GFX9-W64-NEXT: .LBB47_1: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -2688,6 +2694,7 @@ define amdgpu_ps float @test_strict_wwm6_loop() { ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 +; GFX10-W32-NEXT: .p2align 5 ; GFX10-W32-NEXT: .LBB47_1: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1