[DSE] Add support for not aligned begin/end
This is an attempt to improve handling of partial overlaps in the case of an unaligned begin/end.

The existing implementation simply bails out when it encounters such cases. Even when it doesn't, I believe the existing code that checks alignment constraints is not quite correct: it tries to ensure alignment of the "later" start/end offset, while it should be preserving the relative alignment between the earlier and later start/end.

The idea behind the change is simple: when the start/end is not aligned as we wish, instead of bailing out, adjust it as necessary to get the desired alignment.
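Roughly, the adjustment amounts to the following (a minimal standalone sketch, not the code in the patch: offsetToAlign re-implements llvm::offsetToAlignment, trimEnd/trimBegin are hypothetical names for the two branches of tryToShorten, and PrefAlign is assumed to be at least 1, as getDestAlign().valueOrOne() guarantees):

#include <cstdint>

// Bytes to add to Value to reach the next multiple of PrefAlign.
static uint64_t offsetToAlign(uint64_t Value, uint64_t PrefAlign) {
  return (PrefAlign - Value % PrefAlign) % PrefAlign;
}

// Overlap at the end: keep a prefix of the earlier write, rounding the cut
// point up so the surviving length stays a multiple of PrefAlign.
static bool trimEnd(int64_t EarlierStart, uint64_t &EarlierSize,
                    int64_t LaterStart, uint64_t PrefAlign) {
  uint64_t Keep = uint64_t(LaterStart - EarlierStart);
  Keep += offsetToAlign(Keep, PrefAlign);
  if (Keep >= EarlierSize)
    return false; // rounding left nothing to remove; bail out as before
  EarlierSize = Keep;
  return true;
}

// Overlap at the begin: keep a suffix, rounding the removed prefix down so
// the shifted start keeps the destination's original alignment.
static bool trimBegin(int64_t &EarlierStart, uint64_t &EarlierSize,
                      int64_t LaterStart, uint64_t LaterSize,
                      uint64_t PrefAlign) {
  uint64_t Remove = LaterSize - uint64_t(EarlierStart - LaterStart);
  Remove -= Remove % PrefAlign; // drop whole PrefAlign-sized chunks only
  if (Remove == 0 || Remove >= EarlierSize)
    return false;
  EarlierStart += Remove;
  EarlierSize -= Remove;
  return true;
}

In other words, the end case rounds the cut point up so the kept prefix remains a whole number of aligned chunks, and the begin case rounds the removed prefix down so the new destination keeps the original preferred alignment.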

I'll update with performance results as measured by the test-suite...it's still running...

Reviewed By: fhahn

Differential Revision: https://reviews.llvm.org/D93530
Evgeniy Brevnov committed Mar 4, 2021
1 parent 21427b8 commit e94125f
Showing 4 changed files with 188 additions and 38 deletions.
88 changes: 62 additions & 26 deletions llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1067,58 +1067,94 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
return MadeChange;
}

- static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
- uint64_t &EarlierSize, int64_t LaterOffset,
+ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart,
+ uint64_t &EarlierSize, int64_t LaterStart,
uint64_t LaterSize, bool IsOverwriteEnd) {
// TODO: base this on the target vector size so that if the earlier
// store was too small to get vector writes anyway then its likely
// a good idea to shorten it
// Power of 2 vector writes are probably always a bad idea to optimize
// as any store/memset/memcpy is likely using vector instructions so
// shortening it to not vector size is likely to be slower
auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
- unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
- if (!IsOverwriteEnd)
- LaterOffset = int64_t(LaterOffset + LaterSize);

- if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
- !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
- return false;
+ Align PrefAlign = EarlierIntrinsic->getDestAlign().valueOrOne();

+ // We assume that memset/memcpy operates in chunks of the "largest" native
+ // type size and is aligned on the same value. That means the optimal start
+ // and size of a memset/memcpy should be a multiple of the preferred alignment
+ // of that type, i.e. there is no sense in trying to reduce the store size any
+ // further since any "extra" stores come for free anyway.
+ // On the other hand, the maximum alignment we can achieve is limited by the
+ // alignment of the initial store.

+ // TODO: Limit maximum alignment by preferred (or abi?) alignment of the
+ // "largest" native type.
+ // Note: What is the proper way to get that value?
+ // Should TargetTransformInfo::getRegisterBitWidth be used or anything else?
+ // PrefAlign = std::min(DL.getPrefTypeAlign(LargestType), PrefAlign);

+ int64_t ToRemoveStart = 0;
+ uint64_t ToRemoveSize = 0;
+ // Compute start and size of the region to remove. Make sure 'PrefAlign' is
+ // maintained on the remaining store.
+ if (IsOverwriteEnd) {
+ // Calculate required adjustment for 'LaterStart' in order to keep remaining
+ // store size aligned on 'PrefAlign'.
+ uint64_t Off =
+ offsetToAlignment(uint64_t(LaterStart - EarlierStart), PrefAlign);
+ ToRemoveStart = LaterStart + Off;
+ if (EarlierSize <= uint64_t(ToRemoveStart - EarlierStart))
+ return false;
+ ToRemoveSize = EarlierSize - uint64_t(ToRemoveStart - EarlierStart);
+ } else {
+ ToRemoveStart = EarlierStart;
+ assert(LaterSize >= uint64_t(EarlierStart - LaterStart) &&
+ "Not overlapping accesses?");
+ ToRemoveSize = LaterSize - uint64_t(EarlierStart - LaterStart);
+ // Calculate required adjustment for 'ToRemoveSize' in order to keep
+ // start of the remaining store aligned on 'PrefAlign'.
+ uint64_t Off = offsetToAlignment(ToRemoveSize, PrefAlign);
+ if (Off != 0) {
+ if (ToRemoveSize <= (PrefAlign.value() - Off))
+ return false;
+ ToRemoveSize -= PrefAlign.value() - Off;
+ }
+ assert(isAligned(PrefAlign, ToRemoveSize) &&
+ "Should preserve selected alignment");
+ }

- int64_t NewLength = IsOverwriteEnd
- ? LaterOffset - EarlierOffset
- : EarlierSize - (LaterOffset - EarlierOffset);
+ assert(ToRemoveSize > 0 && "Shouldn't reach here if nothing to remove");
+ assert(EarlierSize > ToRemoveSize && "Can't remove more than original size");

+ uint64_t NewSize = EarlierSize - ToRemoveSize;
if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
// When shortening an atomic memory intrinsic, the newly shortened
// length must remain an integer multiple of the element size.
const uint32_t ElementSize = AMI->getElementSizeInBytes();
- if (0 != NewLength % ElementSize)
+ if (0 != NewSize % ElementSize)
return false;
}

LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
<< (IsOverwriteEnd ? "END" : "BEGIN") << ": "
- << *EarlierWrite << "\n  KILLER (offset " << LaterOffset
- << ", " << EarlierSize << ")\n");
+ << *EarlierWrite << "\n  KILLER [" << ToRemoveStart << ", "
+ << int64_t(ToRemoveStart + ToRemoveSize) << ")\n");

Value *EarlierWriteLength = EarlierIntrinsic->getLength();
Value *TrimmedLength =
- ConstantInt::get(EarlierWriteLength->getType(), NewLength);
+ ConstantInt::get(EarlierWriteLength->getType(), NewSize);
EarlierIntrinsic->setLength(TrimmedLength);
+ EarlierIntrinsic->setDestAlignment(PrefAlign);

- EarlierSize = NewLength;
if (!IsOverwriteEnd) {
- int64_t OffsetMoved = (LaterOffset - EarlierOffset);
Value *Indices[1] = {
- ConstantInt::get(EarlierWriteLength->getType(), OffsetMoved)};
+ ConstantInt::get(EarlierWriteLength->getType(), ToRemoveSize)};
GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
EarlierIntrinsic->getRawDest()->getType()->getPointerElementType(),
EarlierIntrinsic->getRawDest(), Indices, "", EarlierWrite);
NewDestGEP->setDebugLoc(EarlierIntrinsic->getDebugLoc());
EarlierIntrinsic->setDest(NewDestGEP);
- EarlierOffset = EarlierOffset + OffsetMoved;
}

+ // Finally update start and size of earlier access.
+ if (!IsOverwriteEnd)
+ EarlierStart += ToRemoveSize;
+ EarlierSize = NewSize;

return true;
}
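For example, in the @write2to10 test updated below the earlier memset covers bytes [4, 36) of %p with a preferred alignment of 4, and the later i64 store covers bytes [2, 10). The raw overlap with the memset's head is 10 - 4 = 6 bytes; rounding that down to a multiple of 4 removes 4 bytes, so the memset's destination is advanced by 4 and its length shrinks from 32 to 28, which is exactly what the new getelementptr plus the i64 28 length in the updated CHECK lines express.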

@@ -234,12 +234,13 @@ entry:
ret void
}

- define void @dontwrite2to9(i32* nocapture %p) {
- ; CHECK-LABEL: @dontwrite2to9(
+ define void @write2to10(i32* nocapture %p) {
+ ; CHECK-LABEL: @write2to10(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
- ; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i1 false)
+ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+ ; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i1 false)
; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16*
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1
; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64*
@@ -257,12 +258,13 @@ entry:
ret void
}

- define void @dontwrite2to9_atomic(i32* nocapture %p) {
- ; CHECK-LABEL: @dontwrite2to9_atomic(
+ define void @write2to10_atomic(i32* nocapture %p) {
+ ; CHECK-LABEL: @write2to10_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
- ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4)
+ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+ ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i32 4)
; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16*
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1
; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64*
@@ -391,3 +393,55 @@ entry:
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind

define void @ow_begin_align1(i8* nocapture %p) {
; CHECK-LABEL: @ow_begin_align1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 7
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP0]], i8 0, i64 25, i1 false)
; CHECK-NEXT: [[P2:%.*]] = bitcast i8* [[P]] to i64*
; CHECK-NEXT: store i64 1, i64* [[P2]], align 1
; CHECK-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, i8* %p, i64 1
call void @llvm.memset.p0i8.i64(i8* align 1 %p1, i8 0, i64 32, i1 false)
%p2 = bitcast i8* %p to i64*
store i64 1, i64* %p2, align 1
ret void
}

define void @ow_end_align4(i8* nocapture %p) {
; CHECK-LABEL: @ow_end_align4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 4
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i1 false)
; CHECK-NEXT: [[P2:%.*]] = bitcast i8* [[P]] to i64*
; CHECK-NEXT: store i64 1, i64* [[P2]], align 1
; CHECK-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, i8* %p, i64 1
call void @llvm.memset.p0i8.i64(i8* align 4 %p1, i8 0, i64 32, i1 false)
%p2 = bitcast i8* %p to i64*
store i64 1, i64* %p2, align 1
ret void
}

define void @ow_end_align8(i8* nocapture %p) {
; CHECK-LABEL: @ow_end_align8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[P1]], i8 0, i64 32, i1 false)
; CHECK-NEXT: [[P2:%.*]] = bitcast i8* [[P]] to i64*
; CHECK-NEXT: store i64 1, i64* [[P2]], align 1
; CHECK-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, i8* %p, i64 1
call void @llvm.memset.p0i8.i64(i8* align 8 %p1, i8 0, i64 32, i1 false)
%p2 = bitcast i8* %p to i64*
store i64 1, i64* %p2, align 1
ret void
}
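The @ow_end_align8 case above exercises the bail-out path: only 7 bytes of the memset's head are killed by the store, and with a preferred alignment of 8 rounding that down to a whole chunk leaves nothing to remove, so the memset is left untouched (its length stays 32 in the CHECK line).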
@@ -388,3 +388,61 @@ entry:
store i64 3, i64* %base64_3, align 8
ret void
}

define void @ow_end_align1(i8* nocapture %p) {
; CHECK-LABEL: @ow_end_align1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[P1]], i8 0, i64 27, i1 false)
; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 27
; CHECK-NEXT: [[P2_I64:%.*]] = bitcast i8* [[P2]] to i64*
; CHECK-NEXT: store i64 1, i64* [[P2_I64]], align 1
; CHECK-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, i8* %p, i64 1
call void @llvm.memset.p0i8.i64(i8* align 1 %p1, i8 0, i64 32, i1 false)
%p2 = getelementptr inbounds i8, i8* %p1, i64 27
%p2.i64 = bitcast i8* %p2 to i64*
store i64 1, i64* %p2.i64, align 1
ret void
}

define void @ow_end_align4(i8* nocapture %p) {
; CHECK-LABEL: @ow_end_align4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P1]], i8 0, i64 28, i1 false)
; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 27
; CHECK-NEXT: [[P2_I64:%.*]] = bitcast i8* [[P2]] to i64*
; CHECK-NEXT: store i64 1, i64* [[P2_I64]], align 1
; CHECK-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, i8* %p, i64 1
call void @llvm.memset.p0i8.i64(i8* align 4 %p1, i8 0, i64 32, i1 false)
%p2 = getelementptr inbounds i8, i8* %p1, i64 27
%p2.i64 = bitcast i8* %p2 to i64*
store i64 1, i64* %p2.i64, align 1
ret void
}

define void @ow_end_align8(i8* nocapture %p) {
; CHECK-LABEL: @ow_end_align8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[P1]], i8 0, i64 32, i1 false)
; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 27
; CHECK-NEXT: [[P2_I64:%.*]] = bitcast i8* [[P2]] to i64*
; CHECK-NEXT: store i64 1, i64* [[P2_I64]], align 1
; CHECK-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, i8* %p, i64 1
call void @llvm.memset.p0i8.i64(i8* align 8 %p1, i8 0, i64 32, i1 false)
%p2 = getelementptr inbounds i8, i8* %p1, i64 27
%p2.i64 = bitcast i8* %p2 to i64*
store i64 1, i64* %p2.i64, align 1
ret void
}
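For these end-overlap tests, the killing i64 store begins at byte 27 of the 32-byte memset at %p1, so the cut point is 27 rounded up to the memset's preferred alignment: with align 1 the length becomes 27, with align 4 it becomes 28, and with align 8 the cut point would be 32, leaving nothing to remove, so the memset keeps its full length of 32.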

@@ -234,12 +234,13 @@ entry:
ret void
}

- define void @dontwrite2to9(i32* nocapture %p) {
- ; CHECK-LABEL: @dontwrite2to9(
+ define void @write2to10(i32* nocapture %p) {
+ ; CHECK-LABEL: @write2to10(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
- ; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i1 false)
+ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+ ; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i1 false)
; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16*
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1
; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64*
@@ -257,12 +258,13 @@ entry:
ret void
}

- define void @dontwrite2to9_atomic(i32* nocapture %p) {
- ; CHECK-LABEL: @dontwrite2to9_atomic(
+ define void @write2to10_atomic(i32* nocapture %p) {
+ ; CHECK-LABEL: @write2to10_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
- ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4)
+ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+ ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i32 4)
; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16*
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1
; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64*
