Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ LLVM_ABI bool expandMemMoveAsLoop(MemMoveInst *MemMove,
const TargetTransformInfo &TTI);

/// Expand \p MemSet as a loop. \p MemSet is not deleted.
LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet);
LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet,
const TargetTransformInfo &TTI);

/// Expand \p MemSetPattern as a loop. \p MemSet is not deleted.
LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet);
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
canEmitLibcall(TM, ParentFunc, RTLIB::MEMSET))
break;

expandMemSetAsLoop(Memset);
expandMemSetAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
}
Expand All @@ -384,7 +384,9 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
if (isa<ConstantInt>(Memset->getLength()))
break;

expandMemSetAsLoop(Memset);
Function *ParentFunc = Memset->getFunction();
const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
expandMemSetAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
break;
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -635,7 +635,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
MemSetInst &MSI) {
if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
return false;
llvm::expandMemSetAsLoop(&MSI);
llvm::expandMemSetAsLoop(&MSI,
TM->getTargetTransformInfo(*MSI.getFunction()));
MSI.eraseFromParent();
return true;
}
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ static cl::opt<size_t> InlineMaxBB(
// Unroll factor applied when memory intrinsics with a statically known size
// are expanded as inline loops (see LowerMemIntrinsics).
static cl::opt<unsigned> MemcpyLoopUnroll(
    "amdgpu-memcpy-loop-unroll",
    // Note the trailing space before "memset": adjacent string literals are
    // concatenated, so without it the help text would read "ormemset".
    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
             "operations when lowering statically-sized memcpy, memmove, or "
             "memset as a loop"),
    cl::init(16), cl::Hidden);

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
} else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
expandMemMoveAsLoop(Memmove, TTI);
} else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
expandMemSetAsLoop(Memset);
expandMemSetAsLoop(Memset, TTI);
}
MemCall->eraseFromParent();
}
Expand Down
11 changes: 7 additions & 4 deletions llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "SPIRVTargetMachine.h"
#include "SPIRVUtils.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/IRBuilder.h"
Expand Down Expand Up @@ -93,7 +94,8 @@ static Function *getOrCreateFunction(Module *M, Type *RetTy,
return NewF;
}

static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic,
const TargetTransformInfo &TTI) {
// For @llvm.memset.* intrinsic cases with constant value and length arguments
// are emulated via "storing" a constant array to the destination. For other
// cases we wrap the intrinsic in @spirv.llvm_memset_* function and expand the
Expand Down Expand Up @@ -137,7 +139,7 @@ static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
auto *MemSet = IRB.CreateMemSet(Dest, Val, Len, MSI->getDestAlign(),
MSI->isVolatile());
IRB.CreateRetVoid();
expandMemSetAsLoop(cast<MemSetInst>(MemSet));
expandMemSetAsLoop(cast<MemSetInst>(MemSet), TTI);
MemSet->eraseFromParent();
break;
}
Expand Down Expand Up @@ -399,6 +401,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
bool Changed = false;
const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
SmallVector<Instruction *> EraseFromParent;
const TargetTransformInfo &TTI = TM.getTargetTransformInfo(*F);
for (BasicBlock &BB : *F) {
for (Instruction &I : make_early_inc_range(BB)) {
auto Call = dyn_cast<CallInst>(&I);
Expand All @@ -411,7 +414,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
switch (II->getIntrinsicID()) {
case Intrinsic::memset:
case Intrinsic::bswap:
Changed |= lowerIntrinsicToFunction(II);
Changed |= lowerIntrinsicToFunction(II, TTI);
break;
case Intrinsic::fshl:
case Intrinsic::fshr:
Expand Down Expand Up @@ -459,7 +462,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
return false;
return II->getCalledFunction()->getName().starts_with(Prefix);
}))
Changed |= lowerIntrinsicToFunction(II);
Changed |= lowerIntrinsicToFunction(II, TTI);
break;
}
}
Expand Down
204 changes: 197 additions & 7 deletions llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -930,9 +930,187 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
}
}

/// Create a Value of \p DstType that consists of a sequence of copies of
/// \p SetValue, using bitcasts and a vector splat.
static Value *createMemSetSplat(const DataLayout &DL, IRBuilderBase &B,
                                Value *SetValue, Type *DstType) {
  Type *SrcTy = SetValue->getType();
  unsigned SrcBytes = DL.getTypeStoreSize(SrcTy);
  unsigned DstBytes = DL.getTypeStoreSize(DstType);
  assert(SrcBytes == DL.getTypeAllocSize(SrcTy) &&
         "Store size and alloc size of SetValue's type must match");
  assert(SrcBytes != 0 && DstBytes % SrcBytes == 0 &&
         "DstType size must be a multiple of SetValue size");

  Value *Splat = SetValue;
  if (DstBytes != SrcBytes) {
    // Only integer and floating-point scalars can be vector elements; any
    // other type is first reinterpreted as an integer of the same width.
    if (!(SrcTy->isIntegerTy() || SrcTy->isFloatingPointTy()))
      Splat = B.CreateBitCast(
          Splat, Type::getIntNTy(SetValue->getContext(), SrcBytes * 8),
          "setvalue.toint");
    // Repeat the value often enough to cover DstType's store size.
    Splat =
        B.CreateVectorSplat(DstBytes / SrcBytes, Splat, "setvalue.splat");
  }

  // The width now matches; bitcast if the type itself still differs.
  if (Splat->getType() != DstType)
    Splat = B.CreateBitCast(Splat, DstType, "setvalue.splat.cast");
  return Splat;
}

/// Expand a memset with the compile-time-constant length \p Len into IR that
/// stores the i8 \p SetValue to \p Len bytes starting at \p DstAddr, inserted
/// before \p InsertBefore.
///
/// A main loop stores a wide splat of \p SetValue using the memory access type
/// \p TTI selects for an equivalent memcpy; since the size is known
/// statically, the tail that does not fill a whole loop operation is covered
/// by straight-line stores instead of a residual loop. \p IsVolatile marks
/// every emitted store volatile.
static void createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr,
                                      ConstantInt *Len, Value *SetValue,
                                      Align DstAlign, bool IsVolatile,
                                      const TargetTransformInfo &TTI) {
  // No need to expand zero length memsets.
  if (Len->isZero())
    return;

  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();

  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *TypeOfLen = Len->getType();
  Type *Int8Type = Type::getInt8Ty(Ctx);
  assert(SetValue->getType() == Int8Type && "Can only set bytes");

  // Use the same memory access type as for a memcpy with the same Dst and Src
  // alignment and address space.
  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
      Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);

  // Bytes covered by whole LoopOpSize-wide stores; the rest becomes the
  // straight-line residual below.
  uint64_t LoopEndCount = alignDown(Len->getZExtValue(), LoopOpSize);

  if (LoopEndCount != 0) {
    // Materialize the widened splat of SetValue once, before the loop.
    Value *SplatSetValue = nullptr;
    {
      IRBuilder<> PreLoopBuilder(InsertBefore);
      SplatSetValue =
          createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
    }

    // Don't generate a residual loop, the remaining bytes are set with
    // straight-line code. (Residual op size 0 requests no residual loop.)
    LoopExpansionInfo LEI =
        insertLoopExpansion(InsertBefore, Len, LoopOpSize, 0, "static-memset");

    // Fill MainLoopBB: one wide store of the splat per iteration.
    IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
    // The per-iteration offset is a multiple of LoopOpSize, so this alignment
    // holds at every store in the loop.
    Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

    Value *DstGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);

    MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
                                       IsVolatile);

    assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
           "No residual loop was requested");
  }

  // Emit straight-line stores for the tail, using the operand types TTI
  // recommends for a memcpy residual of the same size.
  uint64_t BytesSet = LoopEndCount;
  uint64_t RemainingBytes = Len->getZExtValue() - BytesSet;
  if (RemainingBytes == 0)
    return;

  IRBuilder<> RBuilder(InsertBefore);

  SmallVector<Type *, 5> RemainingOps;
  TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                        DstAS, DstAS, DstAlign, DstAlign,
                                        std::nullopt);

  Type *PreviousOpTy = nullptr;
  Value *SplatSetValue = nullptr;
  for (auto *OpTy : RemainingOps) {
    unsigned OperandSize = DL.getTypeStoreSize(OpTy);
    // Alignment known at the current byte offset from DstAddr.
    Align PartDstAlign(commonAlignment(DstAlign, BytesSet));

    // Avoid recomputing the splat SetValue if it's the same as for the last
    // iteration.
    if (OpTy != PreviousOpTy)
      SplatSetValue = createMemSetSplat(DL, RBuilder, SetValue, OpTy);

    Value *DstGEP = RBuilder.CreateInBoundsGEP(
        Int8Type, DstAddr, ConstantInt::get(TypeOfLen, BytesSet));
    RBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
                                IsVolatile);
    BytesSet += OperandSize;
    PreviousOpTy = OpTy;
  }
  assert(BytesSet == Len->getZExtValue() &&
         "Bytes set should match size in the call!");
}

/// Expand a memset whose length \p Len is only known at runtime into IR that
/// stores the i8 \p SetValue to \p Len bytes starting at \p DstAddr, inserted
/// before \p InsertBefore.
///
/// A main loop stores a wide splat of \p SetValue using the memory access type
/// \p TTI selects for an equivalent memcpy; because the tail size is not known
/// statically, remaining bytes are handled by a byte-wise residual loop.
/// \p IsVolatile marks every emitted store volatile.
static void createMemSetLoopUnknownSize(Instruction *InsertBefore,
                                        Value *DstAddr, Value *Len,
                                        Value *SetValue, Align DstAlign,
                                        bool IsVolatile,
                                        const TargetTransformInfo &TTI) {
  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();

  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *Int8Type = Type::getInt8Ty(Ctx);
  assert(SetValue->getType() == Int8Type && "Can only set bytes");

  // Use the same memory access type as for a memcpy with the same Dst and Src
  // alignment and address space.
  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
      Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);

  // The residual loop writes single bytes since the tail size is dynamic.
  Type *ResidualLoopOpType = Int8Type;
  unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);

  // Materialize the widened splat of SetValue once, before the loop.
  // Initialized to nullptr (not SetValue, which was a dead store) for
  // consistency with createMemSetLoopKnownSize.
  Value *SplatSetValue = nullptr;
  {
    IRBuilder<> PreLoopBuilder(InsertBefore);
    SplatSetValue = createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
  }

  LoopExpansionInfo LEI = insertLoopExpansion(
      InsertBefore, Len, LoopOpSize, ResidualLoopOpSize, "dynamic-memset");

  // Fill MainLoopBB: one wide store of the splat per iteration.
  IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
  // The per-iteration offset is a multiple of LoopOpSize, so this alignment
  // holds at every store in the loop.
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  Value *DstGEP =
      MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
  MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
                                     IsVolatile);

  // Fill ResidualLoopBB: one byte store per iteration. Nothing to emit if
  // insertLoopExpansion created no residual loop.
  if (!LEI.ResidualLoopIP)
    return;

  Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));

  IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);

  Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
                                                      LEI.ResidualLoopIndex);
  ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
                                    IsVolatile);
}

static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
Value *CopyLen, Value *SetValue, Align DstAlign,
bool IsVolatile) {
// Currently no longer used for memset, only for memset.pattern.
// TODO: Update the memset.pattern lowering to also use the loop expansion
// framework and remove this function.
Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
Expand Down Expand Up @@ -1067,13 +1245,25 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
return true;
}

void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
createMemSetLoop(/* InsertBefore */ Memset,
/* DstAddr */ Memset->getRawDest(),
/* CopyLen */ Memset->getLength(),
/* SetValue */ Memset->getValue(),
/* Alignment */ Memset->getDestAlign().valueOrOne(),
Memset->isVolatile());
void llvm::expandMemSetAsLoop(MemSetInst *Memset,
                              const TargetTransformInfo &TTI) {
  // Common operands, shared by both expansion variants.
  Value *Dst = Memset->getRawDest();
  Value *SetVal = Memset->getValue();
  Align DstAlign = Memset->getDestAlign().valueOrOne();
  bool IsVolatile = Memset->isVolatile();

  // A compile-time-constant length allows the variant whose tail is emitted
  // as straight-line code; otherwise a residual loop is required.
  if (auto *ConstLen = dyn_cast<ConstantInt>(Memset->getLength()))
    createMemSetLoopKnownSize(/* InsertBefore */ Memset, Dst, ConstLen, SetVal,
                              DstAlign, IsVolatile, TTI);
  else
    createMemSetLoopUnknownSize(/* InsertBefore */ Memset, Dst,
                                Memset->getLength(), SetVal, DstAlign,
                                IsVolatile, TTI);
}

void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
Expand Down
Loading