diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index b7ac90e33f65e..86980ee851bb1 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -125,7 +125,7 @@ class SIFixSGPRCopies : public MachineFunctionPass { SmallVector PHINodes; SmallVector S2VCopies; unsigned NextVGPRToSGPRCopyID; - DenseMap V2SCopies; + MapVector V2SCopies; DenseMap> SiblingPenalty; public: @@ -988,7 +988,7 @@ bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { for (auto J : Info->Siblings) { auto InfoIt = V2SCopies.find(J); if (InfoIt != V2SCopies.end()) { - MachineInstr *SiblingCopy = InfoIt->getSecond().Copy; + MachineInstr *SiblingCopy = InfoIt->second.Copy; if (SiblingCopy->isImplicitDef()) // the COPY has already been MoveToVALUed continue; @@ -1023,12 +1023,12 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { unsigned CurID = LoweringWorklist.pop_back_val(); auto CurInfoIt = V2SCopies.find(CurID); if (CurInfoIt != V2SCopies.end()) { - V2SCopyInfo C = CurInfoIt->getSecond(); + V2SCopyInfo C = CurInfoIt->second; LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump()); for (auto S : C.Siblings) { auto SibInfoIt = V2SCopies.find(S); if (SibInfoIt != V2SCopies.end()) { - V2SCopyInfo &SI = SibInfoIt->getSecond(); + V2SCopyInfo &SI = SibInfoIt->second; LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump()); if (!SI.NeedToBeConvertedToVALU) { SI.SChain.set_subtract(C.SChain); @@ -1040,6 +1040,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { } LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy << " is being turned to VALU\n"); + // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if + // instead. V2SCopies.erase(C.ID); Copies.insert(C.Copy); } diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index b5a823355543c..96dbb03cded55 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -948,12 +948,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec - ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr38, $vgpr20, implicit $exec + ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr20, implicit $exec ; GFX90A-NEXT: renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec - ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr36, $vgpr10, implicit $exec - ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr48, $vgpr12, implicit $exec + ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec + ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr10, implicit $exec + ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr36, $vgpr12, implicit $exec ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc @@ -975,20 +975,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr29 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr33 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr10, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr27, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 50693a92bc92c..c6320658e7800 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -1965,7 +1965,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_mul_lo_u32 v2, s8, v4 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 ; VI-NEXT: v_mul_lo_u32 v3, s9, v5 -; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 ; VI-NEXT: v_mul_hi_u32 v6, v5, v0 ; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0 diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll new file mode 100644 index 0000000000000..a78a8a2ed8fe6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s + +define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) { +; CHECK-LABEL: f: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %bb3 +; CHECK-NEXT: v_mov_b32_e32 v5, v0 +; CHECK-NEXT: s_branch .LBB0_3 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: v_mov_b32_e32 v5, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: .LBB0_3: ; %bb4 +; CHECK-NEXT: v_mov_b32_e32 v6, 0 +; CHECK-NEXT: s_mov_b32 s1, s0 +; CHECK-NEXT: s_mov_b32 s2, s0 +; CHECK-NEXT: s_mov_b32 s3, s0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_mov_b32_e32 v7, v6 +; CHECK-NEXT: v_mov_b32_e32 v8, v6 +; CHECK-NEXT: v_mov_b32_e32 v2, v6 +; CHECK-NEXT: v_mov_b32_e32 v3, v6 +; CHECK-NEXT: v_mov_b32_e32 v4, v6 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: buffer_store_b128 v[5:8], v6, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_b128 v[1:4], v6, s[0:3], 0 idxen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; CHECK-NEXT: s_endpgm +bb: + %i = icmp eq i32 %arg, 0 + br i1 %i, label %bb4, label %bb3 + +bb3: + br label %bb4 + +bb4: + %i5 = phi i32 [ %arg1, %bb3 ], [ 1, %bb ] + %i6 = phi i32 [ %arg2, %bb3 ], [ 0, %bb ] + %i7 = insertelement <4 x i32> zeroinitializer, i32 %i5, i64 0 + %i8 = bitcast <4 x i32> %i7 to <4 x float> + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %i8, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0) + %i9 = insertelement <4 x i32> zeroinitializer, i32 %i6, i64 0 + %i10 = bitcast <4 x i32> %i9 to <4 x float> + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %i10, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 086fab3f52175..680c27b697709 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2 ; NOSDWA-NEXT: s_endpgm ; @@ -30,7 +30,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX89-NEXT: v_mov_b32_e32 v0, s0 ; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX89-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX89-NEXT: flat_store_dword v[0:1], v2 ; GFX89-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index aa65a62e242da..60f309c0c1882 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -509,7 +509,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %35:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5