Skip to content

Commit

Permalink
[loop-idiom] Hoist loop memcpys to loop preheader
Browse files Browse the repository at this point in the history
For a simple loop like:
```
struct S {
  int x;
  int y;
  char b;
};

unsigned foo(S* __restrict__ a, S* b, int n) {
  for (int i = 0; i < n; i++)
    a[i] = b[i];

  return sizeof(a[0]);
}
```
We could eliminate the loop and convert it to a single large memcpy of 12*n bytes. Currently this is not handled. The output of `opt -loop-idiom -S < memcpy_before.ll` is:
```
%struct.S = type { i32, i32, i8 }

define dso_local i32 @_Z3fooP1SS0_i(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret i32 12

for.body:                                         ; preds = %for.body, %for.body.preheader
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %idxprom = zext i32 %i.08 to i64
  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
  %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
  %0 = bitcast %struct.S* %arrayidx2 to i8*
  %1 = bitcast %struct.S* %arrayidx to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
  %inc = add nuw nsw i32 %i.08, 1
  %cmp = icmp slt i32 %inc, %n
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0

attributes #0 = { argmemonly nofree nosync nounwind willreturn }

```
The loop idiom pass currently only handles load and store instructions. Since struct S is too big to fit in a register, the loop body contains a memcpy intrinsic.

With this change, re-running `opt -loop-idiom -S < memcpy_before.ll` hoists the loop memcpy into the loop preheader as a single memcpy that covers all iterations. For this trivial case, the remaining loop is dead and will be removed by another pass.
```
%struct.S = type { i32, i32, i8 }

define dso_local i32 @_Z3fooP1SS0_i(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr {
entry:
  %a1 = bitcast %struct.S* %a to i8*
  %b2 = bitcast %struct.S* %b to i8*
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %0 = zext i32 %n to i64
  %1 = mul nuw nsw i64 %0, 12
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %a1, i8* align 4 %b2, i64 %1, i1 false)
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret i32 12

for.body:                                         ; preds = %for.body, %for.body.preheader
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %idxprom = zext i32 %i.08 to i64
  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
  %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
  %2 = bitcast %struct.S* %arrayidx2 to i8*
  %3 = bitcast %struct.S* %arrayidx to i8*
  %inc = add nuw nsw i32 %i.08, 1
  %cmp = icmp slt i32 %inc, %n
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0

attributes #0 = { argmemonly nofree nosync nounwind willreturn }
```

Reviewed By: zino

Differential Revision: https://reviews.llvm.org/D97667
  • Loading branch information
zhuhan0 committed Apr 28, 2021
1 parent 82d3c07 commit 75d6b8b
Show file tree
Hide file tree
Showing 5 changed files with 526 additions and 24 deletions.
148 changes: 126 additions & 22 deletions llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,13 @@ class LoopIdiomRecognize {
enum class ForMemset { No, Yes };
bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
ForMemset For);

template <typename MemInst>
bool processLoopMemIntrinsic(
BasicBlock *BB,
bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
const SCEV *BECount);
bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
Expand Down Expand Up @@ -635,22 +642,10 @@ bool LoopIdiomRecognize::runOnLoopBlock(
for (auto &SI : StoreRefsForMemcpy)
MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
Instruction *Inst = &*I++;
// Look for memset instructions, which may be optimized to a larger memset.
if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
WeakTrackingVH InstPtr(&*I);
if (!processLoopMemSet(MSI, BECount))
continue;
MadeChange = true;

// If processing the memset invalidated our iterator, start over from the
// top of the block.
if (!InstPtr)
I = BB->begin();
continue;
}
}
MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
MadeChange |= processLoopMemIntrinsic<MemSetInst>(
BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

return MadeChange;
}
Expand Down Expand Up @@ -799,6 +794,86 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
return Changed;
}

/// processLoopMemIntrinsic - Walk every instruction of \p BB and hand each one
/// of type \p MemInst to \p Processor, so the same scan can serve different
/// memory intrinsic kinds.
///
/// \param BB the basic block to scan.
/// \param Processor member function invoked on each matching instruction; it
///        returns true when it changed the IR.
/// \param BECount passed through unchanged to \p Processor.
/// \returns true if \p Processor reported a change for at least one
///          instruction.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool Changed = false;
  BasicBlock::iterator Cursor = BB->begin();
  BasicBlock::iterator BlockEnd = BB->end();
  while (Cursor != BlockEnd) {
    // Step past the current instruction before touching it, so the cursor
    // already names its successor if the processor rewrites the block.
    Instruction *Current = &*Cursor++;
    MemInst *Candidate = dyn_cast<MemInst>(Current);
    if (!Candidate)
      continue;

    // Keep a weak handle on the successor; it becomes null if the processor
    // deletes that instruction.
    WeakTrackingVH Successor(&*Cursor);
    if (!(this->*Processor)(Candidate, BECount))
      continue;
    Changed = true;

    // The instruction the cursor refers to is gone: rescan the block from its
    // first instruction.
    if (!Successor)
      Cursor = BB->begin();
  }
  return Changed;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
///
/// The memcpy qualifies only if it is non-volatile with a constant length that
/// fits in 32 bits, both of its pointers are affine AddRecs on the current
/// loop with constant strides, the store stride equals the copy size (in
/// either direction), and the load stride is identical to the store stride.
///
/// \param MCI the memcpy intrinsic to examine.
/// \param BECount passed through unchanged to processLoopStoreOfLoopLoad.
/// \returns false if any check fails; otherwise the result of
///          processLoopStoreOfLoopLoad.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if (!HasMemcpy || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1} on
  // the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *StrStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *LoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!StrStride || !LoadStride)
    return false;

  APInt StrIntStride = StrStride->getAPInt();
  APInt LoadIntStride = LoadStride->getAPInt();

  // The two strides are compared as sign-extended 64-bit integers below,
  // because comparing two APInts directly requires equal bit widths and
  // nothing here guarantees the source and destination strides share one.
  // Give up on strides that do not fit in 64 bits.
  if (StrIntStride.getBitWidth() > 64 || LoadIntStride.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StrIntStride && SizeInBytes != -StrIntStride)
    return false;

  // The load must advance by exactly the same amount, in the same direction,
  // as the store. Strides of equal magnitude but opposite sign describe a loop
  // that copies the elements in reversed order, which a single memcpy cannot
  // reproduce.
  if (StrIntStride.getSExtValue() != LoadIntStride.getSExtValue())
    return false;

  return processLoopStoreOfLoopLoad(Dest, Source,
                                    static_cast<unsigned>(SizeInBytes),
                                    MCI->getDestAlign(), MCI->getSourceAlign(),
                                    MCI, MCI, StoreEv, LoadEv, BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
const SCEV *BECount) {
Expand All @@ -807,7 +882,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
return false;

// If we're not allowed to hack on memset, we fail.
if (!HasMemset)
if (!HasMemset || DisableLIRP::Memset)
return false;

Value *Pointer = MSI->getDest();
Expand Down Expand Up @@ -1047,9 +1122,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
NewCall->getDebugLoc(), Preheader)
<< "Transformed loop-strided store into a call to "
<< "Transformed loop-strided store in "
<< ore::NV("Function", TheStore->getFunction())
<< " function into a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())
<< "() function";
<< "() intrinsic";
});

// Okay, the memset has been formed. Zap the original store and anything that
Expand Down Expand Up @@ -1137,9 +1214,22 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(

SmallPtrSet<Instruction *, 1> Stores;
Stores.insert(TheStore);

bool IsMemCpy = isa<MemCpyInst>(TheStore);
const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
StoreSize, *AA, Stores))
StoreSize, *AA, Stores)) {
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
TheStore)
<< ore::NV("Inst", InstRemark) << " in "
<< ore::NV("Function", TheStore->getFunction())
<< " function will not be hoisted: "
<< ore::NV("Reason", "The loop may access store location");
});
return Changed;
}

const SCEV *LdStart = LoadEv->getStart();
unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
Expand All @@ -1153,9 +1243,21 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
Value *LoadBasePtr = Expander.expandCodeFor(
LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

// If the store is a memcpy instruction, we must check if it will write to
// the load memory locations. So remove it from the ignored stores.
if (IsMemCpy)
Stores.erase(TheStore);
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
StoreSize, *AA, Stores))
StoreSize, *AA, Stores)) {
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
<< ore::NV("Inst", InstRemark) << " in "
<< ore::NV("Function", TheStore->getFunction())
<< " function will not be hoisted: "
<< ore::NV("Reason", "The loop may access load location");
});
return Changed;
}

if (avoidLIRForMultiBlockLoop())
return Changed;
Expand Down Expand Up @@ -1216,7 +1318,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
NewCall->getDebugLoc(), Preheader)
<< "Formed a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())
<< "() function";
<< "() intrinsic from " << ore::NV("Inst", InstRemark)
<< " instruction in " << ore::NV("Function", TheStore->getFunction())
<< " function";
});

// Okay, the memcpy has been formed. Zap the original store and anything that
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"

; Check that everything still works when debuginfo is present, and that it is reasonably propagated.

; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() function
; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() intrinsic from load and store instruction in test6_dest_align function

define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp {
; CHECK-LABEL: @test6_dest_align(
Expand Down
89 changes: 89 additions & 0 deletions llvm/test/Transforms/LoopIdiom/memcpy-intrinsic-different-types.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-idiom < %s -S | FileCheck %s

; #include <vector>
;
; class SDValue {
; int A;
; int B;
; unsigned C;
; };
;
; class SDUse {
; SDValue Val;
; SDUse **Prev = nullptr;
; SDUse *Next = nullptr;
;
; public:
; operator const SDValue&() const { return Val; }
; };
;
; void foo(SDUse *S, int N) {
; // Should not hoist memcpy because source and destination are of different types
; std::vector<SDValue> Ops(S, S + N);
; }

; ModuleID = 'different_types.cpp'
source_filename = "different_types.cpp"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%class.SDUse = type { %class.SDValue, %class.SDUse**, %class.SDUse* }
%class.SDValue = type { i32, i32, i32 }

declare dso_local i32 @__gxx_personality_v0(...)

; Function Attrs: uwtable mustprogress
; Loop that copies 12 bytes per iteration from an array of %class.SDUse into an
; array of %class.SDValue. Each iteration advances the source pointer by one
; %class.SDUse and the destination pointer by one %class.SDValue (see the two
; getelementptr instructions in for.inc), so the two pointers step by
; different element types. The CHECK lines below expect the per-iteration
; memcpy to stay in for.inc and the preheader to contain only its branch, i.e.
; no hoisted memcpy is formed.
define linkonce_odr dso_local %class.SDValue* @_ZNSt20__uninitialized_copyILb0EE13__uninit_copyIP5SDUseP7SDValueEET0_T_S7_S6_(%class.SDUse* %__first, %class.SDUse* %__last, %class.SDValue* %__result) local_unnamed_addr #0 align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
; CHECK-LABEL: @_ZNSt20__uninitialized_copyILb0EE13__uninit_copyIP5SDUseP7SDValueEET0_T_S7_S6_(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP_NOT15:%.*]] = icmp eq %class.SDUse* [[__FIRST:%.*]], [[__LAST:%.*]]
; CHECK-NEXT: br i1 [[CMP_NOT15]], label [[FOR_END:%.*]], label [[FOR_INC_PREHEADER:%.*]]
; CHECK: for.inc.preheader:
; CHECK-NEXT: br label [[FOR_INC:%.*]]
; CHECK: for.inc:
; CHECK-NEXT: [[__CUR_017:%.*]] = phi %class.SDValue* [ [[INCDEC_PTR1:%.*]], [[FOR_INC]] ], [ [[__RESULT:%.*]], [[FOR_INC_PREHEADER]] ]
; CHECK-NEXT: [[__FIRST_ADDR_016:%.*]] = phi %class.SDUse* [ [[INCDEC_PTR:%.*]], [[FOR_INC]] ], [ [[__FIRST]], [[FOR_INC_PREHEADER]] ]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast %class.SDValue* [[__CUR_017]] to i8*
; CHECK-NEXT: [[TMP1:%.*]] = bitcast %class.SDUse* [[__FIRST_ADDR_016]] to i8*
; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) [[TMP0]], i8* noundef nonnull align 8 dereferenceable(12) [[TMP1]], i64 12, i1 false)
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds [[CLASS_SDUSE:%.*]], %class.SDUse* [[__FIRST_ADDR_016]], i64 1
; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds [[CLASS_SDVALUE:%.*]], %class.SDValue* [[__CUR_017]], i64 1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq %class.SDUse* [[INCDEC_PTR]], [[__LAST]]
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_INC]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: [[INCDEC_PTR1_LCSSA:%.*]] = phi %class.SDValue* [ [[INCDEC_PTR1]], [[FOR_INC]] ]
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: [[__CUR_0_LCSSA:%.*]] = phi %class.SDValue* [ [[__RESULT]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR1_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT: ret %class.SDValue* [[__CUR_0_LCSSA]]
;
entry:
%cmp.not15 = icmp eq %class.SDUse* %__first, %__last
br i1 %cmp.not15, label %for.end, label %for.inc.preheader

for.inc.preheader: ; preds = %entry
br label %for.inc

for.inc: ; preds = %for.inc.preheader, %for.inc
%__cur.017 = phi %class.SDValue* [ %incdec.ptr1, %for.inc ], [ %__result, %for.inc.preheader ]
%__first.addr.016 = phi %class.SDUse* [ %incdec.ptr, %for.inc ], [ %__first, %for.inc.preheader ]
%0 = bitcast %class.SDValue* %__cur.017 to i8*
%1 = bitcast %class.SDUse* %__first.addr.016 to i8*
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) %0, i8* noundef nonnull align 8 dereferenceable(12) %1, i64 12, i1 false)
%incdec.ptr = getelementptr inbounds %class.SDUse, %class.SDUse* %__first.addr.016, i64 1
%incdec.ptr1 = getelementptr inbounds %class.SDValue, %class.SDValue* %__cur.017, i64 1
%cmp.not = icmp eq %class.SDUse* %incdec.ptr, %__last
br i1 %cmp.not, label %for.end.loopexit, label %for.inc

for.end.loopexit: ; preds = %for.inc
%incdec.ptr1.lcssa = phi %class.SDValue* [ %incdec.ptr1, %for.inc ]
br label %for.end

for.end: ; preds = %for.end.loopexit, %entry
%__cur.0.lcssa = phi %class.SDValue* [ %__result, %entry ], [ %incdec.ptr1.lcssa, %for.end.loopexit ]
ret %class.SDValue* %__cur.0.lcssa
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
Loading

0 comments on commit 75d6b8b

Please sign in to comment.