[ARM] Implement TTI::isHardwareLoopProfitable
    
Implement the backend target hook to drive the HardwareLoops pass.
The low-overhead branch extension for Arm M-class cores is flexible
enough that we don't have to ensure correctness at this point, except
checking that the loop counter variable can be stored in LR - a
32-bit register. For the transformation to be profitable, we want to
avoid loops that contain function calls or any other instructions that
alter the PC.
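As a rough sketch of the idea (illustrative only, not taken from the patch; the function names are made up), the first loop below is the kind of candidate we are after, while the second would be rejected because the call to expf is lowered to a bl, clobbering LR:

#include <math.h>

void scale(int *Dst, const int *Src, int N) {
  for (int i = 0; i < N; ++i)      // no calls in the body: a candidate
    Dst[i] = Src[i] * 3;
}

void scale_exp(float *Dst, const float *Src, int N) {
  for (int i = 0; i < N; ++i)      // expf() lowers to a call: not profitable
    Dst[i] = expf(Src[i]);
}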
    
This implementation uses TargetLoweringInfo to query type and
operation actions, inspects intrinsic calls, and also performs some
manual checks for remainder/division and FP operations.
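As one concrete case (again illustrative, with a made-up function name): 64-bit integer division is type-legalized into an RTABI call, which the operation action query alone does not report as a LibCall, hence the manual check:

#include <stdint.h>

uint64_t chunks(uint64_t Bytes, uint64_t Size) {
  return Bytes / Size;   // becomes a call to __aeabi_uldivmod on AEABI M-class targets
}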
    
I think this should be a good base to start from, and extra details
can be filled in later.

Differential Revision: https://reviews.llvm.org/D62907

llvm-svn: 363149
sparker-arm committed Jun 12, 2019
1 parent 91bb72a commit 757ac02
Showing 8 changed files with 1,332 additions and 0 deletions.
194 changes: 194 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -36,6 +36,10 @@ using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(true),
cl::desc("Disable the generation of low-overhead loops"));

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -628,6 +632,196 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
UseMaskForCond, UseMaskForGaps);
}

bool ARMTTIImpl::isLoweredToCall(const Function *F) {
if (!F->isIntrinsic())
return BaseT::isLoweredToCall(F);

// Assume all Arm-specific intrinsics map to an instruction.
if (F->getName().startswith("llvm.arm"))
return false;

switch (F->getIntrinsicID()) {
default: break;
case Intrinsic::powi:
case Intrinsic::sin:
case Intrinsic::cos:
case Intrinsic::pow:
case Intrinsic::log:
case Intrinsic::log10:
case Intrinsic::log2:
case Intrinsic::exp:
case Intrinsic::exp2:
return true;
case Intrinsic::sqrt:
case Intrinsic::fabs:
case Intrinsic::copysign:
case Intrinsic::floor:
case Intrinsic::ceil:
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::round:
case Intrinsic::canonicalize:
case Intrinsic::lround:
case Intrinsic::llround:
case Intrinsic::lrint:
case Intrinsic::llrint:
if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
return true;
if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
return true;
// Some operations can be handled by vector instructions and assume
// unsupported vectors will be expanded into supported scalar ones.
// TODO Handle scalar operations properly.
return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
case Intrinsic::masked_store:
case Intrinsic::masked_load:
case Intrinsic::masked_gather:
case Intrinsic::masked_scatter:
return !ST->hasMVEIntegerOps();
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::sadd_sat:
case Intrinsic::uadd_sat:
case Intrinsic::ssub_sat:
case Intrinsic::usub_sat:
return false;
}

return BaseT::isLoweredToCall(F);
}

bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
TTI::HardwareLoopInfo &HWLoopInfo) {
// Low-overhead branches are only supported in the 'low-overhead branch'
// extension of v8.1-m.
if (!ST->hasLOB() || DisableLowOverheadLoops)
return false;

// For now, for simplicity, only support loops with one exit block.
if (!L->getExitBlock())
return false;

if (!SE.hasLoopInvariantBackedgeTakenCount(L))
return false;

const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return false;

const SCEV *TripCountSCEV =
SE.getAddExpr(BackedgeTakenCount,
SE.getOne(BackedgeTakenCount->getType()));
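// i.e. TripCount = BackedgeTakenCount + 1; the loop body executes once
// more than the backedge is taken.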

// We need to store the trip count in LR, a 32-bit register.
if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
return false;

// Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
// point in generating a hardware loop if that's going to happen.
auto MaybeCall = [this](Instruction &I) {
const ARMTargetLowering *TLI = getTLI();
unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
EVT VT = TLI->getValueType(DL, I.getType(), true);
if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
return true;

// Check if an intrinsic will be lowered to a call and assume that any
// other CallInst will generate a bl.
if (auto *Call = dyn_cast<CallInst>(&I)) {
if (isa<IntrinsicInst>(Call)) {
if (const Function *F = Call->getCalledFunction())
return isLoweredToCall(F);
}
return true;
}

// FPv5 provides conversions between integer, double-precision,
// single-precision, and half-precision formats.
switch (I.getOpcode()) {
default:
break;
case Instruction::FPToSI:
case Instruction::FPToUI:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::FPTrunc:
case Instruction::FPExt:
return !ST->hasFPARMv8Base();
}

// FIXME: Unfortunately the approach of checking the Operation Action does
// not catch all cases of Legalization that use library calls. Our
// Legalization step categorizes some transformations into library calls as
// Custom, Expand or even Legal when doing type legalization. So for now
// we have to special case for instance the SDIV of 64bit integers and the
// use of floating point emulation.
if (VT.isInteger() && VT.getSizeInBits() >= 64) {
switch (ISD) {
default:
break;
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM:
case ISD::SDIVREM:
case ISD::UDIVREM:
return true;
}
}

// Assume all other non-float operations are supported.
if (!VT.isFloatingPoint())
return false;

// We'll need a library call to handle most floats when using soft.
if (TLI->useSoftFloat()) {
switch (I.getOpcode()) {
default:
return true;
case Instruction::Alloca:
case Instruction::Load:
case Instruction::Store:
case Instruction::Select:
case Instruction::PHI:
return false;
}
}

// We'll need a libcall to perform double precision operations on a single
// precision only FPU.
if (I.getType()->isDoubleTy() && !ST->hasFP64())
return true;

// Likewise for half precision arithmetic.
if (I.getType()->isHalfTy() && !ST->hasFullFP16())
return true;

return false;
};

// Scan the instructions to see if there's any that we know will turn into a
// call.
for (auto *BB : L->getBlocks())
for (auto &I : *BB)
if (MaybeCall(I))
return false;

// TODO: Check whether the trip count calculation is expensive. If L is the
// inner loop but we know it has a low trip count, calculating that trip
// count (in the parent loop) may be detrimental.

LLVMContext &C = L->getHeader()->getContext();
HWLoopInfo.CounterInReg = true;
HWLoopInfo.CountType = Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;
}

void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Only currently enable these preferences for M-Class cores.
6 changes: 6 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -180,6 +180,12 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
bool UseMaskForCond = false,
bool UseMaskForGaps = false);

bool isLoweredToCall(const Function *F);
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
TTI::HardwareLoopInfo &HWLoopInfo);

void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);

