Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 26 additions & 50 deletions llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
Original file line number Diff line number Diff line change
@@ -626,35 +626,26 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
std::vector<Chain> Ret;
Ret.push_back({C.front()});

unsigned ElemBytes = DL.getTypeStoreSize(getChainElemTy(C));
APInt PrevReadEnd = C[0].OffsetFromLeader +
DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
// `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
auto &CurChain = Ret.back();
unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
const ChainElem &Prev = CurChain.back();
unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
"collectEquivalenceClass");
APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;

// Add this instruction to the end of the current chain, or start a new one.
assert(SzBytes % ElemBytes == 0);
APInt ReadEnd = It->OffsetFromLeader + SzBytes;
// Allow redundancy: partial or full overlap counts as contiguous.
bool AreContiguous = false;
if (It->OffsetFromLeader.sle(PrevReadEnd)) {
uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader).getZExtValue();
if (Overlap % ElemBytes == 0)
AreContiguous = true;
}

LLVM_DEBUG(dbgs() << "LSV: Instruction is "
<< (AreContiguous ? "contiguous" : "chain-breaker")
<< *It->Inst << " (starts at offset "
bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
LLVM_DEBUG(dbgs() << "LSV: Instructions are "
<< (AreContiguous ? "" : "not ") << "contiguous: "
<< *Prev.Inst << " (ends at offset " << PrevReadEnd
<< ") -> " << *It->Inst << " (starts at offset "
<< It->OffsetFromLeader << ")\n");

if (AreContiguous)
CurChain.push_back(*It);
else
Ret.push_back({*It});
PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
}

// Filter out length-1 chains, these are uninteresting.
@@ -883,24 +874,15 @@ bool Vectorizer::vectorizeChain(Chain &C) {
Type *VecElemTy = getChainElemTy(C);
bool IsLoadChain = isa<LoadInst>(C[0].Inst);
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
unsigned BytesAdded = DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
unsigned ChainBytes = BytesAdded;
for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
APInt ReadEnd = It->OffsetFromLeader + SzBytes;
// Update ChainBytes considering possible overlap.
BytesAdded =
PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
ChainBytes += BytesAdded;
PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
}

unsigned ChainBytes = std::accumulate(
C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
});
assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
// VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
// than 1 byte (e.g. VecTy == <32 x i1>).
unsigned NumElem = 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy);
Type *VecTy = FixedVectorType::get(VecElemTy, NumElem);
Type *VecTy = FixedVectorType::get(
VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));

Align Alignment = getLoadStoreAlignment(C[0].Inst);
// If this is a load/store of an alloca, we might have upgraded the alloca's
@@ -927,31 +909,27 @@ bool Vectorizer::vectorizeChain(Chain &C) {
llvm::min_element(C, [](const auto &A, const auto &B) {
return A.Inst->comesBefore(B.Inst);
})->Inst);
// This can happen due to a chain of redundant loads.
// In this case, just use the element-type, and avoid ExtractElement.
if (NumElem == 1)
VecTy = VecElemTy;

// Chain is in offset order, so C[0] is the instr with the lowest offset,
// i.e. the root of the vector.
VecInst = Builder.CreateAlignedLoad(VecTy,
getLoadStorePointerOperand(C[0].Inst),
Alignment);

unsigned VecIdx = 0;
for (const ChainElem &E : C) {
Instruction *I = E.Inst;
Value *V;
Type *T = getLoadStoreType(I);
int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
if (auto *VT = dyn_cast<FixedVectorType>(T)) {
auto Mask = llvm::to_vector<8>(
llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
} else if (VecTy != VecElemTy) {
VecIdx += VT->getNumElements();
} else {
V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
I->getName());
} else {
V = VecInst;
++VecIdx;
}
if (V->getType() != I->getType())
V = Builder.CreateBitOrPointerCast(V, I->getType());
@@ -986,24 +964,22 @@ bool Vectorizer::vectorizeChain(Chain &C) {

// Build the vector to store.
Value *Vec = PoisonValue::get(VecTy);
auto InsertElem = [&](Value *V, unsigned VecIdx) {
unsigned VecIdx = 0;
auto InsertElem = [&](Value *V) {
if (V->getType() != VecElemTy)
V = Builder.CreateBitOrPointerCast(V, VecElemTy);
Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
};
for (const ChainElem &E : C) {
auto *I = cast<StoreInst>(E.Inst);
int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
if (FixedVectorType *VT =
dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
Builder.getInt32(J)),
VecIdx++);
Builder.getInt32(J)));
}
} else {
InsertElem(I->getValueOperand(), VecIdx);
InsertElem(I->getValueOperand());
}
}

Expand Down
9 changes: 4 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
Original file line number Diff line number Diff line change
@@ -3850,9 +3850,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `ptr addrspace(1) poison`, addrspace 1)
; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[LOAD2]](s32)
; CHECK-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p5) = G_INTTOPTR [[LOAD2]](s32)
; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p3) from `ptr addrspace(1) poison`, addrspace 1)
; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p5) from `ptr addrspace(1) poison`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3881,10 +3880,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
; CHECK-NEXT: G_STORE [[INTTOPTR]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
; CHECK-NEXT: G_STORE [[LOAD2]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32)
; CHECK-NEXT: G_STORE [[INTTOPTR1]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: G_STORE [[LOAD3]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)
Expand Down
Loading
Loading