[AArch64][Falkor] Avoid HW prefetcher tag collisions (step 1)
Summary:
This patch is the first step in reducing HW prefetcher instruction tag
collisions in inner loops for Falkor.  It adds a pass that annotates IR
loads with metadata to indicate that they are known to be strided loads,
and adds a target lowering hook that translates this metadata to a
target-specific MachineMemOperand flag.

A follow-on change will use this MachineMemOperand flag to rewrite
instructions to reduce tag collisions.
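
For illustration, a minimal sketch of the annotated IR after the marking pass
runs (based on the shape of the falkor-hwpf.ll test added below; %p and %iv are
placeholder names):

  %gep = getelementptr inbounds i32, i32* %p, i32 %iv
  %load = load i32, i32* %gep, !falkor.strided.access !0
  ...
  !0 = !{}    ; empty node: its presence alone is the marker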

Reviewers: mcrosier, t.p.northover

Subscribers: aemerson, rengolin, mgorny, javed.absar, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D34963

llvm-svn: 308059
geoffberry committed Jul 14, 2017
1 parent 2b9b9c8 commit b1e8714
Showing 10 changed files with 299 additions and 6 deletions.
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64.h
@@ -44,6 +44,7 @@ ModulePass *createAArch64PromoteConstantPass();
FunctionPass *createAArch64ConditionOptimizerPass();
FunctionPass *createAArch64A57FPLoadBalancing();
FunctionPass *createAArch64A53Fix835769();
FunctionPass *createFalkorMarkStridedAccessesPass();

FunctionPass *createAArch64CleanupLocalDynamicTLSPass();

@@ -66,6 +67,7 @@ void initializeAArch64VectorByElementOptPass(PassRegistry&);
void initializeAArch64PromoteConstantPass(PassRegistry&);
void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
void initializeAArch64StorePairSuppressPass(PassRegistry&);
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
} // end namespace llvm

147 changes: 147 additions & 0 deletions llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -0,0 +1,147 @@
//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// For Falkor, we want to avoid HW prefetcher instruction tag collisions that
// may inhibit the HW prefetching. This is done in two steps. Before ISel, we
// mark strided loads (i.e. those that will likely benefit from prefetching)
// with metadata. Then, after opcodes have been finalized, we insert MOVs and
// rewrite loads to prevent unintentional tag collisions.
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "falkor-hwpf-fix"

STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");

namespace {

class FalkorMarkStridedAccesses {
public:
FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
: LI(LI), SE(SE) {}

bool run();

private:
bool runOnLoop(Loop *L);

LoopInfo &LI;
ScalarEvolution &SE;
};

class FalkorMarkStridedAccessesLegacy : public FunctionPass {
public:
static char ID; // Pass ID, replacement for typeid
FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
initializeFalkorMarkStridedAccessesLegacyPass(
*PassRegistry::getPassRegistry());
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
// FIXME: For some reason, preserving SE here breaks LSR (even if
// this pass changes nothing).
// AU.addPreserved<ScalarEvolutionWrapperPass>();
}

bool runOnFunction(Function &F) override;
};
} // namespace

char FalkorMarkStridedAccessesLegacy::ID = 0;
INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
"Falkor HW Prefetch Fix", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
"Falkor HW Prefetch Fix", false, false)

FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
return new FalkorMarkStridedAccessesLegacy();
}

bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const AArch64Subtarget *ST =
TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
if (ST->getProcFamily() != AArch64Subtarget::Falkor)
return false;

if (skipFunction(F))
return false;

LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();

FalkorMarkStridedAccesses LDP(LI, SE);
return LDP.run();
}

bool FalkorMarkStridedAccesses::run() {
bool MadeChange = false;

for (Loop *I : LI)
for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
MadeChange |= runOnLoop(*L);

return MadeChange;
}

bool FalkorMarkStridedAccesses::runOnLoop(Loop *L) {
// Only mark strided loads in the inner-most loop
if (!L->empty())
return false;

bool MadeChange = false;

for (const auto BB : L->blocks()) {
for (auto &I : *BB) {
LoadInst *LoadI = dyn_cast<LoadInst>(&I);
if (!LoadI)
continue;

Value *PtrValue = LoadI->getPointerOperand();
if (L->isLoopInvariant(PtrValue))
continue;

const SCEV *LSCEV = SE.getSCEV(PtrValue);
const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
continue;

LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
MDNode::get(LoadI->getContext(), {}));
++NumStridedLoadsMarked;
DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
MadeChange = true;
}
}

return MadeChange;
}
8 changes: 8 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7482,6 +7482,14 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}

MachineMemOperand::Flags
AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
return MOStridedAccess;
return MachineMemOperand::MONone;
}

bool AArch64TargetLowering::isLegalInterleavedAccessType(
VectorType *VecTy, const DataLayout &DL) const {

2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -455,6 +455,8 @@ class AArch64TargetLowering : public TargetLowering {
unsigned getNumInterleavedAccesses(VectorType *VecTy,
const DataLayout &DL) const;

MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;

private:
bool isExtFreeImpl(const Instruction *Ext) const override;

13 changes: 9 additions & 4 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -52,9 +52,6 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static const MachineMemOperand::Flags MOSuppressPair =
MachineMemOperand::MOTargetFlag1;

static cl::opt<unsigned>
TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
@@ -1715,6 +1712,13 @@ void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const {
(*MI.memoperands_begin())->setFlags(MOSuppressPair);
}

/// Check all MachineMemOperands for a hint that the load/store is strided.
bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOStridedAccess;
});
}

bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
switch (Opc) {
default:
@@ -4433,7 +4437,8 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
{{MOSuppressPair, "aarch64-suppress-pair"}};
{{MOSuppressPair, "aarch64-suppress-pair"},
{MOStridedAccess, "aarch64-strided-access"}};
return makeArrayRef(TargetFlags);
}

10 changes: 10 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -27,6 +27,13 @@ namespace llvm {
class AArch64Subtarget;
class AArch64TargetMachine;

static const MachineMemOperand::Flags MOSuppressPair =
MachineMemOperand::MOTargetFlag1;
static const MachineMemOperand::Flags MOStridedAccess =
MachineMemOperand::MOTargetFlag2;

#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access"

class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
const AArch64Subtarget &Subtarget;
@@ -81,6 +88,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
/// unprofitable.
bool isLdStPairSuppressed(const MachineInstr &MI) const;

/// Return true if the given load or store is a strided memory access.
bool isStridedAccess(const MachineInstr &MI) const;

/// Return true if this is an unscaled load/store.
bool isUnscaledLdSt(unsigned Opc) const;

12 changes: 10 additions & 2 deletions llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -138,6 +138,9 @@ static cl::opt<int> EnableGlobalISelAtO(
cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
cl::init(-1));

static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
cl::init(true), cl::Hidden);

extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -158,6 +161,7 @@ extern "C" void LLVMInitializeAArch64Target() {
initializeAArch64PromoteConstantPass(*PR);
initializeAArch64RedundantCopyEliminationPass(*PR);
initializeAArch64StorePairSuppressPass(*PR);
initializeFalkorMarkStridedAccessesLegacyPass(*PR);
initializeLDTLSCleanupPass(*PR);
}

@@ -346,8 +350,12 @@ void AArch64PassConfig::addIRPasses() {
//
// Run this before LSR to remove the multiplies involved in computing the
// pointer values N iterations ahead.
if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch)
addPass(createLoopDataPrefetchPass());
if (TM->getOptLevel() != CodeGenOpt::None) {
if (EnableLoopDataPrefetch)
addPass(createLoopDataPrefetchPass());
if (EnableFalkorHWPFFix)
addPass(createFalkorMarkStridedAccessesPass());
}

TargetPassConfig::addIRPasses();

1 change: 1 addition & 0 deletions llvm/lib/Target/AArch64/CMakeLists.txt
@@ -47,6 +47,7 @@ add_llvm_target(AArch64CodeGen
AArch64ConditionalCompares.cpp
AArch64DeadRegisterDefinitionsPass.cpp
AArch64ExpandPseudoInsts.cpp
AArch64FalkorHWPFFix.cpp
AArch64FastISel.cpp
AArch64A53Fix835769.cpp
AArch64FrameLowering.cpp
106 changes: 106 additions & 0 deletions llvm/test/CodeGen/AArch64/falkor-hwpf.ll
@@ -0,0 +1,106 @@
; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s
; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF

; Check that strided access metadata is added to loads in inner loops when compiling for Falkor.

; CHECK-LABEL: @hwpf1(
; CHECK: load i32, i32* %gep, !falkor.strided.access !0
; CHECK: load i32, i32* %gep2, !falkor.strided.access !0

; NOHWPF-LABEL: @hwpf1(
; NOHWPF: load i32, i32* %gep{{$}}
; NOHWPF: load i32, i32* %gep2{{$}}
define void @hwpf1(i32* %p, i32* %p2) {
entry:
br label %loop

loop:
%iv = phi i32 [ 0, %entry ], [ %inc, %loop ]

%gep = getelementptr inbounds i32, i32* %p, i32 %iv
%load = load i32, i32* %gep

%gep2 = getelementptr inbounds i32, i32* %p2, i32 %iv
%load2 = load i32, i32* %gep2

%inc = add i32 %iv, 1
%exitcnd = icmp uge i32 %inc, 1024
br i1 %exitcnd, label %exit, label %loop

exit:
ret void
}

; Check that outer loop strided load isn't marked.
; CHECK-LABEL: @hwpf2(
; CHECK: load i32, i32* %gep, !falkor.strided.access !0
; CHECK: load i32, i32* %gep2{{$}}

; NOHWPF-LABEL: @hwpf2(
; NOHWPF: load i32, i32* %gep{{$}}
; NOHWPF: load i32, i32* %gep2{{$}}
define void @hwpf2(i32* %p) {
entry:
br label %loop1

loop1:
%iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ]
%outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ]
br label %loop2.header

loop2.header:
br label %loop2

loop2:
%iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ]
%sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ]
%gep = getelementptr inbounds i32, i32* %p, i32 %iv2
%load = load i32, i32* %gep
%sum.inc = add i32 %sum, %load
%inc2 = add i32 %iv2, 1
%exitcnd2 = icmp uge i32 %inc2, 1024
br i1 %exitcnd2, label %exit2, label %loop2

exit2:
%gep2 = getelementptr inbounds i32, i32* %p, i32 %iv1
%load2 = load i32, i32* %gep2
br label %loop1.latch

loop1.latch:
%inc1 = add i32 %iv1, 1
%exitcnd1 = icmp uge i32 %inc1, 1024
br i1 %exitcnd1, label %exit, label %loop1

exit:
ret void
}

; Check that non-strided load isn't marked.
; CHECK-LABEL: @hwpf3(
; CHECK: load i32, i32* %gep, !falkor.strided.access !0
; CHECK: load i32, i32* %gep2{{$}}

; NOHWPF-LABEL: @hwpf3(
; NOHWPF: load i32, i32* %gep{{$}}
; NOHWPF: load i32, i32* %gep2{{$}}
define void @hwpf3(i32* %p, i32* %p2) {
entry:
br label %loop

loop:
%iv = phi i32 [ 0, %entry ], [ %inc, %loop ]

%gep = getelementptr inbounds i32, i32* %p, i32 %iv
%load = load i32, i32* %gep

%gep2 = getelementptr inbounds i32, i32* %p2, i32 %load
%load2 = load i32, i32* %gep2

%inc = add i32 %iv, 1
%exitcnd = icmp uge i32 %inc, 1024
br i1 %exitcnd, label %exit, label %loop

exit:
ret void
}
