Revert "[Transform][LoadStoreVectorizer] allow redundant in Chain (#1… #168105
…vm#163019)" This reverts commit 92e5608.
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Gang Chen (cmc-rep)

Changes

Revert "[Transform][LoadStoreVectorizer] allow redundant in Chain (llvm#163019)"

This reverts commit 92e5608.

Patch is 109.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168105.diff

14 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 47bcfa467615b..7b5137b0185ab 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -626,35 +626,26 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
std::vector<Chain> Ret;
Ret.push_back({C.front()});
- unsigned ElemBytes = DL.getTypeStoreSize(getChainElemTy(C));
- APInt PrevReadEnd = C[0].OffsetFromLeader +
- DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
// `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
auto &CurChain = Ret.back();
- unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
+ const ChainElem &Prev = CurChain.back();
+ unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
+ assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
+ "collectEquivalenceClass");
+ APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
// Add this instruction to the end of the current chain, or start a new one.
- assert(SzBytes % ElemBytes == 0);
- APInt ReadEnd = It->OffsetFromLeader + SzBytes;
- // Allow redundancy: partial or full overlap counts as contiguous.
- bool AreContiguous = false;
- if (It->OffsetFromLeader.sle(PrevReadEnd)) {
- uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader).getZExtValue();
- if (Overlap % ElemBytes == 0)
- AreContiguous = true;
- }
-
- LLVM_DEBUG(dbgs() << "LSV: Instruction is "
- << (AreContiguous ? "contiguous" : "chain-breaker")
- << *It->Inst << " (starts at offset "
+ bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
+ LLVM_DEBUG(dbgs() << "LSV: Instructions are "
+ << (AreContiguous ? "" : "not ") << "contiguous: "
+ << *Prev.Inst << " (ends at offset " << PrevReadEnd
+ << ") -> " << *It->Inst << " (starts at offset "
<< It->OffsetFromLeader << ")\n");
-
if (AreContiguous)
CurChain.push_back(*It);
else
Ret.push_back({*It});
- PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
}
// Filter out length-1 chains, these are uninteresting.
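For readers skimming the hunk above: the removed `-` lines accepted a later access that overlapped the previous read end as "contiguous" whenever the overlap was a whole number of elements; the restored `+` lines require each access to begin exactly where the previous one ends, so any gap or overlap starts a new chain. A minimal standalone sketch of the restored rule, using plain integers and an illustrative `Elem` struct in place of `APInt` offsets and the real `ChainElem` (not the LLVM API):

```cpp
#include <cstdint>
#include <vector>

struct Elem {
  int64_t Offset; // byte offset from the chain leader
  int64_t Size;   // access size in bytes
};

std::vector<std::vector<Elem>> splitByContiguity(const std::vector<Elem> &C) {
  std::vector<std::vector<Elem>> Ret;
  if (C.empty())
    return Ret;
  Ret.push_back({C.front()});
  for (size_t I = 1; I < C.size(); ++I) {
    const Elem &Prev = Ret.back().back();
    int64_t PrevReadEnd = Prev.Offset + Prev.Size;
    if (C[I].Offset == PrevReadEnd) // strictly contiguous
      Ret.back().push_back(C[I]);
    else                            // a gap OR an overlap breaks the chain
      Ret.push_back({C[I]});
  }
  return Ret;
}
```

For example, elements at byte ranges [0,4), [4,8), [12,16) split into two chains under this rule, and a redundant reload of [0,4) would now also break the chain rather than extend it.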
@@ -883,24 +874,15 @@ bool Vectorizer::vectorizeChain(Chain &C) {
Type *VecElemTy = getChainElemTy(C);
bool IsLoadChain = isa<LoadInst>(C[0].Inst);
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
- unsigned BytesAdded = DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
- APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
- unsigned ChainBytes = BytesAdded;
- for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
- unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
- APInt ReadEnd = It->OffsetFromLeader + SzBytes;
- // Update ChainBytes considering possible overlap.
- BytesAdded =
- PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
- ChainBytes += BytesAdded;
- PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
- }
-
+ unsigned ChainBytes = std::accumulate(
+ C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
+ return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
+ });
assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
// VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
// than 1 byte (e.g. VecTy == <32 x i1>).
- unsigned NumElem = 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy);
- Type *VecTy = FixedVectorType::get(VecElemTy, NumElem);
+ Type *VecTy = FixedVectorType::get(
+ VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
Align Alignment = getLoadStoreAlignment(C[0].Inst);
// If this is a load/store of an alloca, we might have upgraded the alloca's
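With overlaps no longer allowed, the chain's total byte count is again just the sum of each element's store size, which is why the removed `PrevReadEnd` bookkeeping (needed to avoid double-counting overlapping bytes) collapses back into a single `std::accumulate`. A simplified sketch of the restored computation, with plain integer sizes standing in for the `DataLayout` queries:

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

// Each element contributes its full store size; no overlap deduction needed.
uint64_t chainBytes(const std::vector<uint64_t> &StoreSizes) {
  return std::accumulate(StoreSizes.begin(), StoreSizes.end(), uint64_t{0});
}

// Number of vector lanes given the element size in bits, which may be
// smaller than a byte (e.g. i1 lanes), mirroring
// 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy) above.
uint64_t numLanes(uint64_t ChainBytes, uint64_t ElemBits) {
  return 8 * ChainBytes / ElemBits;
}
```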
@@ -927,31 +909,27 @@ bool Vectorizer::vectorizeChain(Chain &C) {
llvm::min_element(C, [](const auto &A, const auto &B) {
return A.Inst->comesBefore(B.Inst);
})->Inst);
- // This can happen due to a chain of redundant loads.
- // In this case, just use the element-type, and avoid ExtractElement.
- if (NumElem == 1)
- VecTy = VecElemTy;
+
// Chain is in offset order, so C[0] is the instr with the lowest offset,
// i.e. the root of the vector.
VecInst = Builder.CreateAlignedLoad(VecTy,
getLoadStorePointerOperand(C[0].Inst),
Alignment);
+ unsigned VecIdx = 0;
for (const ChainElem &E : C) {
Instruction *I = E.Inst;
Value *V;
Type *T = getLoadStoreType(I);
- int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
- int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
if (auto *VT = dyn_cast<FixedVectorType>(T)) {
auto Mask = llvm::to_vector<8>(
llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
- } else if (VecTy != VecElemTy) {
+ VecIdx += VT->getNumElements();
+ } else {
V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
I->getName());
- } else {
- V = VecInst;
+ ++VecIdx;
}
if (V->getType() != I->getType())
V = Builder.CreateBitOrPointerCast(V, I->getType());
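Because no two chain elements cover the same bytes after this revert, each element's lanes in the wide load can be assigned from a running counter instead of being recomputed from its byte offset, and the removed `NumElem == 1` special case (a chain of fully redundant loads collapsing to a single element) can no longer occur. A rough standalone sketch of the restored indexing, with the IR-building calls stubbed out as prints:

```cpp
#include <cstdio>
#include <vector>

// LaneCounts[i] is how many lanes of the wide load chain element i reads:
// 1 for a scalar access, N for an <N x T> access.
void assignLanes(const std::vector<unsigned> &LaneCounts) {
  unsigned VecIdx = 0; // running lane index into the wide load's result
  for (unsigned Lanes : LaneCounts) {
    if (Lanes > 1) // vector element: shuffle out a contiguous lane range
      std::printf("shufflevector lanes [%u, %u)\n", VecIdx, VecIdx + Lanes);
    else           // scalar element: extract a single lane
      std::printf("extractelement lane %u\n", VecIdx);
    VecIdx += Lanes; // the next element starts right after this one
  }
}
```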
@@ -986,24 +964,22 @@ bool Vectorizer::vectorizeChain(Chain &C) {
// Build the vector to store.
Value *Vec = PoisonValue::get(VecTy);
- auto InsertElem = [&](Value *V, unsigned VecIdx) {
+ unsigned VecIdx = 0;
+ auto InsertElem = [&](Value *V) {
if (V->getType() != VecElemTy)
V = Builder.CreateBitOrPointerCast(V, VecElemTy);
- Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
+ Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
};
for (const ChainElem &E : C) {
auto *I = cast<StoreInst>(E.Inst);
- int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
- int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
if (FixedVectorType *VT =
dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
- Builder.getInt32(J)),
- VecIdx++);
+ Builder.getInt32(J)));
}
} else {
- InsertElem(I->getValueOperand(), VecIdx);
+ InsertElem(I->getValueOperand());
}
}
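The store side mirrors this: since elements are disjoint and visited in offset order, the insertion position is simply a post-incremented counter captured by the lambda, rather than an index derived from each element's offset. A minimal sketch of that pattern with plain values in place of IR:

```cpp
#include <cstdio>
#include <vector>

// Builds up a "vector" by placing each scalar at the next free lane,
// mirroring the VecIdx++ capture in the restored InsertElem lambda.
void buildStoreVector(const std::vector<int> &Scalars) {
  unsigned VecIdx = 0;
  auto InsertElem = [&](int V) {
    std::printf("insertelement value %d at lane %u\n", V, VecIdx++);
  };
  for (int V : Scalars)
    InsertElem(V);
}
```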
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index c935310584949..4e70c15df5741 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -3850,9 +3850,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `ptr addrspace(1) poison`, addrspace 1)
- ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[LOAD2]](s32)
- ; CHECK-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p5) = G_INTTOPTR [[LOAD2]](s32)
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p3) from `ptr addrspace(1) poison`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p5) from `ptr addrspace(1) poison`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3881,10 +3880,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
- ; CHECK-NEXT: G_STORE [[INTTOPTR]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
+ ; CHECK-NEXT: G_STORE [[LOAD2]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32)
- ; CHECK-NEXT: G_STORE [[INTTOPTR1]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
+ ; CHECK-NEXT: G_STORE [[LOAD3]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index aaf7be9ffe112..5c526c78afcd7 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -13,15 +13,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr17, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
- ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg3.kernarg.offset.align.down, align 8, addrspace 4)
+ ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
+ ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
+ ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
- ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg3.kernarg.offset.align.down + 16, align 8, addrspace 4)
- ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr20, 0, implicit-def $scc
+ ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc
- ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr20, 8, implicit-def $scc
+ ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 8, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_CSELECT_B64 -1, 0, implicit killed $scc
; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr30_sgpr31, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $vgpr5 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
@@ -32,7 +33,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.1.bb103:
; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
@@ -40,7 +41,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.2:
; GFX90A-NEXT: successors: %bb.3(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF implicit-def $vgpr16
; GFX90A-NEXT: renamable $vgpr3 = IMPLICIT_DEF implicit-def $vgpr2
@@ -51,7 +52,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.3.Flow17:
; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr6 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc
@@ -59,7 +60,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.4.bb15:
; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr4_vgpr5, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr25, implicit $exec
@@ -358,7 +359,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.35.bb20:
; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i23)
; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec
@@ -405,7 +406,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.37.bb27:
; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67, $sgpr48_sgpr49
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, ...
[truncated]
You can test this locally with the following command:

```
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll llvm/test/CodeGen/AMDGPU/mad_uint24.ll llvm/test/CodeGen/AMDGPU/sad.ll llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
```

The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef`. For example, this is considered a bad practice:

```llvm
define void @fn() {
  ...
  br i1 undef, ...
}
```

Please use the following instead:

```llvm
define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}
```

Please refer to the Undefined Behavior Manual for more information.
…63019)"
This reverts commit 92e5608.