35 changes: 29 additions & 6 deletions llvm/lib/Target/AArch64/AArch64Subtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,36 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
CPUString = "generic";

ParseSubtargetFeatures(CPUString, FS);
initializeProperties();

return *this;
}

/// Apply the CPU-specific tuning values for the current ARMProcFamily on top
/// of the in-class member defaults. Called from
/// initializeSubtargetDependencies() right after ParseSubtargetFeatures().
///
/// Ideally a tablegen feature would exist for this in the future, so these
/// values could be specified together with the subtarget features.
void AArch64Subtarget::initializeProperties() {
  switch (ARMProcFamily) {
  // Families that keep every default. They are spelled out one by one and the
  // switch has no default label.
  case Others:
  case CortexA35:
  case CortexA53:
  case ExynosM1:
    break;

  case CortexA57:
    MaxInterleaveFactor = 4;
    break;

  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    break;

  case Cyclone:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
Expand Down Expand Up @@ -110,8 +137,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
// Enabling or Disabling the latency heuristic is a close call: It seems to
// help nearly no benchmark on out-of-order architectures, on the other hand
// it regresses register pressure on a few benchmarks.
if (isCyclone())
Policy.DisableLatencyHeuristic = true;
Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
Expand All @@ -133,8 +159,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {

/// Returns the custom PBQP constraint for this subtarget: a freshly allocated
/// A57ChainingConstraint when balanceFPOps() is set, nullptr otherwise.
///
/// The choice is keyed on the BalanceFPOps property rather than on a CPU-name
/// comparison, so there is a single reachable decision point.
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  if (!balanceFPOps())
    return nullptr;

  return llvm::make_unique<A57ChainingConstraint>();
}
69 changes: 58 additions & 11 deletions llvm/lib/Target/AArch64/AArch64Subtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ class StringRef;
class Triple;

class AArch64Subtarget : public AArch64GenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
public:
enum ARMProcFamilyEnum : uint8_t {
Others,
CortexA35,
CortexA53,
Expand All @@ -44,6 +44,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
Kryo
};

protected:
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
ARMProcFamilyEnum ARMProcFamily = Others;

Expand All @@ -66,6 +67,24 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {

// StrictAlign - Disallow unaligned memory accesses.
bool StrictAlign = false;
// Boolean tuning flags, all off by default. Nothing in the visible part of
// this change assigns them; presumably ParseSubtargetFeatures() sets them
// from the subtarget feature string (TODO confirm).
bool MergeNarrowLoads = false;
// Returned by useAA().
bool UseAA = false;
bool PredictableSelectIsExpensive = false;
// Read through balanceFPOps().
bool BalanceFPOps = false;
bool CustomAsCheapAsMove = false;
// Returned by enablePostRAScheduler().
bool UsePostRAScheduler = false;
bool Misaligned128StoreIsSlow = false;
bool AvoidQuadLdStPairs = false;
bool UseAlternateSExtLoadCVTF32Pattern = false;
bool HasMacroOpFusion = false;
// Copied into MachineSchedPolicy::DisableLatencyHeuristic by
// overrideSchedPolicy().
bool DisableLatencySchedHeuristic = false;
// Read through useRSqrt().
bool UseRSqrt = false;
// Numeric tuning values. The initializers below are the defaults;
// initializeProperties() overrides some of them for specific processor
// families.
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 3;
uint16_t CacheLineSize = 0;
uint16_t PrefetchDistance = 0;
uint16_t MinPrefetchStride = 1;
unsigned MaxPrefetchIterationsAhead = UINT_MAX;

// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
Expand Down Expand Up @@ -93,6 +112,9 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
/// subtarget initialization.
AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);

/// Initialize properties based on the selected processor family.
void initializeProperties();

public:
/// This constructor initializes the data members to match that
/// of the specified triple.
Expand Down Expand Up @@ -123,7 +145,15 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
/// Returns a reference to the stored TargetTriple.
const Triple &getTargetTriple() const { return TargetTriple; }
/// Always returns true for this subtarget.
bool enableMachineScheduler() const override { return true; }
/// Reports the UsePostRAScheduler property. The answer comes from that single
/// flag rather than from a hard-coded list of CPU-name comparisons.
bool enablePostRAScheduler() const override { return UsePostRAScheduler; }

/// Returns the ARM processor family stored in ARMProcFamily.
/// Try not to use this accessor: CPU-specific decisions should stay local to
/// this class, preferably expressed as SubtargetFeatures or as properties set
/// up in initializeProperties().
ARMProcFamilyEnum getProcFamily() const { return ARMProcFamily; }

/// Returns the HasV8_1aOps flag.
bool hasV8_1aOps() const { return HasV8_1aOps; }
Expand All @@ -140,6 +170,30 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
// Feature queries: each returns the corresponding Has* member.
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
// Tuning-flag queries: each returns the matching boolean member unchanged.
bool mergeNarrowLoads() const { return MergeNarrowLoads; }
bool balanceFPOps() const { return BalanceFPOps; }
bool predictableSelectIsExpensive() const {
return PredictableSelectIsExpensive;
}
bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
bool useAlternateSExtLoadCVTF32Pattern() const {
return UseAlternateSExtLoadCVTF32Pattern;
}
bool hasMacroOpFusion() const { return HasMacroOpFusion; }
bool useRSqrt() const { return UseRSqrt; }
// Numeric tuning values. The members are stored in narrower integer types
// (uint8_t / uint16_t, except MaxPrefetchIterationsAhead) and are widened to
// unsigned on return.
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
unsigned getVectorInsertExtractBaseCost() const {
return VectorInsertExtractBaseCost;
}
unsigned getCacheLineSize() const { return CacheLineSize; }
unsigned getPrefetchDistance() const { return PrefetchDistance; }
unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
unsigned getMaxPrefetchIterationsAhead() const {
return MaxPrefetchIterationsAhead;
}

/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
Expand All @@ -160,14 +214,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
/// Forwards to TargetTriple.isOSBinFormatELF().
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
/// Forwards to TargetTriple.isOSBinFormatMachO().
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }

// CPU-name predicates: each is true only when CPUString compares equal to the
// given CPU name.
// NOTE(review): the deletion count for this header suggests these helpers are
// removed by the change shown here in favour of feature/property flags —
// confirm before adding new callers.
bool isGeneric() const { return CPUString == "generic"; }
bool isCyclone() const { return CPUString == "cyclone"; }
bool isCortexA57() const { return CPUString == "cortex-a57"; }
bool isCortexA53() const { return CPUString == "cortex-a53"; }
bool isExynosM1() const { return CPUString == "exynos-m1"; }
bool isKryo() const { return CPUString == "kryo"; }

/// Reports the UseAA property. This is the only definition of useAA(): the
/// answer comes from the flag rather than from a Cortex-A53 CPU-name check.
bool useAA() const override { return UseAA; }

/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,7 @@ static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
// (52 mantissa bits) are 2 and 3, respectively.
unsigned ExtraStepsF = 2,
ExtraStepsD = ExtraStepsF + 1;
// FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
bool UseRsqrt = ST.isExynosM1();
bool UseRsqrt = ST.useRSqrt();

TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);
Expand Down
27 changes: 6 additions & 21 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,9 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}

// All other insert/extracts cost this much.
if (ST->isKryo())
return 2;
return 3;
return ST->getVectorInsertExtractBaseCost();
}

int AArch64TTIImpl::getArithmeticInstrCost(
Expand Down Expand Up @@ -529,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
}

/// Returns the subtarget's maximum interleave factor. \p VF is not consulted.
///
/// The value is the MaxInterleaveFactor subtarget property: 2 by default,
/// raised to 4 for Cortex-A57 and Kryo in
/// AArch64Subtarget::initializeProperties().
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  return ST->getMaxInterleaveFactor();
}

void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
Expand Down Expand Up @@ -630,28 +626,17 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
}

/// Returns the subtarget's CacheLineSize property: 0 by default, 64 for
/// Cyclone (set in AArch64Subtarget::initializeProperties()).
unsigned AArch64TTIImpl::getCacheLineSize() {
  return ST->getCacheLineSize();
}

/// Returns the subtarget's PrefetchDistance property: 0 by default, 280 for
/// Cyclone (set in AArch64Subtarget::initializeProperties()).
unsigned AArch64TTIImpl::getPrefetchDistance() {
  return ST->getPrefetchDistance();
}

/// Returns the subtarget's MinPrefetchStride property: 1 by default.
///
/// Cyclone uses 2048 (set in AArch64Subtarget::initializeProperties())
/// because its HW prefetcher handles accesses with strides up to 2KB.
unsigned AArch64TTIImpl::getMinPrefetchStride() {
  return ST->getMinPrefetchStride();
}

/// Returns the subtarget's MaxPrefetchIterationsAhead property: UINT_MAX by
/// default.
///
/// Cyclone uses 3 (set in AArch64Subtarget::initializeProperties()): be
/// conservative for now and don't prefetch ahead too much since the loop may
/// terminate early.
unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
  return ST->getMaxPrefetchIterationsAhead();
}