Re-land [Transform][LoadStoreVectorizer] allow redundant in Chain #168135
Conversation
This is the fixed version of llvm#163019
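For context, the change lets splitChainByContiguity treat an access that partially or fully overlaps the previous one as still contiguous, instead of breaking the chain there. A minimal, hypothetical IR sketch of the pattern this targets (illustration only, not taken from the patch's test files; all names are invented):

; %a and %b load the same four bytes and %c loads the next four. With this
; change the fully redundant load of %p no longer splits the chain, so the
; LoadStoreVectorizer may cover all three loads with one wider load and reuse
; a single lane for both %a and %b (subject to the usual alignment checks).
define void @redundant_loads(ptr addrspace(1) %p, ptr addrspace(1) %out) {
  %a = load i32, ptr addrspace(1) %p, align 8
  %b = load i32, ptr addrspace(1) %p, align 8
  %gep = getelementptr inbounds i32, ptr addrspace(1) %p, i64 1
  %c = load i32, ptr addrspace(1) %gep, align 4
  %ab = add i32 %a, %b
  %abc = add i32 %ab, %c
  store i32 %abc, ptr addrspace(1) %out, align 4
  ret void
}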
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers
Author: Gang Chen (cmc-rep)
Changes: This is the fixed version of #163019
Patch is 113.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168135.diff
16 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7b5137b0185ab..7f11f95d79b0d 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -626,26 +626,35 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
std::vector<Chain> Ret;
Ret.push_back({C.front()});
+ unsigned ElemBytes = DL.getTypeStoreSize(getChainElemTy(C));
+ APInt PrevReadEnd = C[0].OffsetFromLeader +
+ DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
// `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
auto &CurChain = Ret.back();
- const ChainElem &Prev = CurChain.back();
- unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
- assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
- "collectEquivalenceClass");
- APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
+ unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
// Add this instruction to the end of the current chain, or start a new one.
- bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
- LLVM_DEBUG(dbgs() << "LSV: Instructions are "
- << (AreContiguous ? "" : "not ") << "contiguous: "
- << *Prev.Inst << " (ends at offset " << PrevReadEnd
- << ") -> " << *It->Inst << " (starts at offset "
+ assert(SzBytes % ElemBytes == 0);
+ APInt ReadEnd = It->OffsetFromLeader + SzBytes;
+ // Allow redundancy: partial or full overlap counts as contiguous.
+ bool AreContiguous = false;
+ if (It->OffsetFromLeader.sle(PrevReadEnd)) {
+ uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader).getZExtValue();
+ if (Overlap % ElemBytes == 0)
+ AreContiguous = true;
+ }
+
+ LLVM_DEBUG(dbgs() << "LSV: Instruction is "
+ << (AreContiguous ? "contiguous" : "chain-breaker")
+ << *It->Inst << " (starts at offset "
<< It->OffsetFromLeader << ")\n");
+
if (AreContiguous)
CurChain.push_back(*It);
else
Ret.push_back({*It});
+ PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
}
// Filter out length-1 chains, these are uninteresting.
@@ -727,14 +736,20 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
// These chains are over the closed interval [CBegin, CEnd].
SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
CandidateChains;
+
+ unsigned BytesAdded = DL.getTypeStoreSize(getLoadStoreType(C[CBegin].Inst));
+ APInt PrevReadEnd = C[CBegin].OffsetFromLeader + BytesAdded;
+ unsigned Sz = BytesAdded;
for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
- APInt Sz = C[CEnd].OffsetFromLeader +
- DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
- C[CBegin].OffsetFromLeader;
- if (Sz.sgt(VecRegBytes))
+ APInt ReadEnd = C[CEnd].OffsetFromLeader +
+ DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst));
+ BytesAdded =
+ PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
+ Sz += BytesAdded;
+ if (Sz > VecRegBytes)
break;
- CandidateChains.emplace_back(CEnd,
- static_cast<unsigned>(Sz.getLimitedValue()));
+ CandidateChains.emplace_back(CEnd, Sz);
+ PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
}
// Consider the longest chain first.
@@ -874,15 +889,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
Type *VecElemTy = getChainElemTy(C);
bool IsLoadChain = isa<LoadInst>(C[0].Inst);
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
- unsigned ChainBytes = std::accumulate(
- C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
- return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
- });
+ unsigned BytesAdded = DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
+ APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
+ unsigned ChainBytes = BytesAdded;
+ for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
+ unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
+ APInt ReadEnd = It->OffsetFromLeader + SzBytes;
+ // Update ChainBytes considering possible overlap.
+ BytesAdded =
+ PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
+ ChainBytes += BytesAdded;
+ PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
+ }
+
assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
// VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
// than 1 byte (e.g. VecTy == <32 x i1>).
- Type *VecTy = FixedVectorType::get(
- VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
+ unsigned NumElem = 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy);
+ Type *VecTy = FixedVectorType::get(VecElemTy, NumElem);
Align Alignment = getLoadStoreAlignment(C[0].Inst);
// If this is a load/store of an alloca, we might have upgraded the alloca's
@@ -909,27 +933,31 @@ bool Vectorizer::vectorizeChain(Chain &C) {
llvm::min_element(C, [](const auto &A, const auto &B) {
return A.Inst->comesBefore(B.Inst);
})->Inst);
-
+ // This can happen due to a chain of redundant loads.
+ // In this case, just use the element-type, and avoid ExtractElement.
+ if (NumElem == 1)
+ VecTy = VecElemTy;
// Chain is in offset order, so C[0] is the instr with the lowest offset,
// i.e. the root of the vector.
VecInst = Builder.CreateAlignedLoad(VecTy,
getLoadStorePointerOperand(C[0].Inst),
Alignment);
- unsigned VecIdx = 0;
for (const ChainElem &E : C) {
Instruction *I = E.Inst;
Value *V;
Type *T = getLoadStoreType(I);
+ int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
+ int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
if (auto *VT = dyn_cast<FixedVectorType>(T)) {
auto Mask = llvm::to_vector<8>(
llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
- VecIdx += VT->getNumElements();
- } else {
+ } else if (VecTy != VecElemTy) {
V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
I->getName());
- ++VecIdx;
+ } else {
+ V = VecInst;
}
if (V->getType() != I->getType())
V = Builder.CreateBitOrPointerCast(V, I->getType());
@@ -964,22 +992,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
// Build the vector to store.
Value *Vec = PoisonValue::get(VecTy);
- unsigned VecIdx = 0;
- auto InsertElem = [&](Value *V) {
+ auto InsertElem = [&](Value *V, unsigned VecIdx) {
if (V->getType() != VecElemTy)
V = Builder.CreateBitOrPointerCast(V, VecElemTy);
- Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
+ Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
};
for (const ChainElem &E : C) {
auto *I = cast<StoreInst>(E.Inst);
+ int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
+ int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
if (FixedVectorType *VT =
dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
- Builder.getInt32(J)));
+ Builder.getInt32(J)),
+ VecIdx++);
}
} else {
- InsertElem(I->getValueOperand());
+ InsertElem(I->getValueOperand(), VecIdx);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 4e70c15df5741..c935310584949 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -3850,8 +3850,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p3) from `ptr addrspace(1) poison`, addrspace 1)
- ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p5) from `ptr addrspace(1) poison`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `ptr addrspace(1) poison`, addrspace 1)
+ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[LOAD2]](s32)
+ ; CHECK-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p5) = G_INTTOPTR [[LOAD2]](s32)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3880,10 +3881,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
- ; CHECK-NEXT: G_STORE [[LOAD2]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
+ ; CHECK-NEXT: G_STORE [[INTTOPTR]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32)
- ; CHECK-NEXT: G_STORE [[LOAD3]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
+ ; CHECK-NEXT: G_STORE [[INTTOPTR1]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 5c526c78afcd7..aaf7be9ffe112 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -13,16 +13,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr17, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
- ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
+ ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg3.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
- ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc
+ ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg3.kernarg.offset.align.down + 16, align 8, addrspace 4)
+ ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr20, 0, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc
- ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 8, implicit-def $scc
+ ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr20, 8, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_CSELECT_B64 -1, 0, implicit killed $scc
; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr30_sgpr31, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $vgpr5 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
@@ -33,7 +32,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.1.bb103:
; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
@@ -41,7 +40,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.2:
; GFX90A-NEXT: successors: %bb.3(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF implicit-def $vgpr16
; GFX90A-NEXT: renamable $vgpr3 = IMPLICIT_DEF implicit-def $vgpr2
@@ -52,7 +51,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.3.Flow17:
; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr6 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc
@@ -60,7 +59,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.4.bb15:
; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr4_vgpr5, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr25, implicit $exec
@@ -359,7 +358,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.35.bb20:
; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
; GFX90A-NEXT: {{...
[truncated]
@llvm/pr-subscribers-backend-amdgpu Author: Gang Chen (cmc-rep): This is the fixed version of #163019 (the bot posted the same patch summary and truncated diff as above).
@llvm/pr-subscribers-llvm-globalisel Author: Gang Chen (cmc-rep): This is the fixed version of #163019 (the bot posted the same patch summary and truncated diff as above).
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-loads.ll llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-redund-loads.ll llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll llvm/test/CodeGen/AMDGPU/mad_uint24.ll llvm/test/CodeGen/AMDGPU/sad.ll llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll

The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. You should use poison values for placeholders instead.

In tests, avoid using undef and having tests only work by accident.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
🐧 Linux x64 Test Results
dakersnar left a comment:
Thanks for addressing the feedback so far. Here's a bit more, and then I think this should be good to go.
dakersnar left a comment:
LGTM! Thanks for refining this. And if you get the chance, I would love a review of #159388 once this lands and I merge your changes into my PR :). I'll ping you when that is ready.