diff --git a/clang/test/CodeGen/builtins-hexagon.c b/clang/test/CodeGen/builtins-hexagon.c index 9a1b733da5cdb..52073f27ae70f 100644 --- a/clang/test/CodeGen/builtins-hexagon.c +++ b/clang/test/CodeGen/builtins-hexagon.c @@ -1,5 +1,5 @@ // REQUIRES: hexagon-registered-target -// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -target-feature +hvx-length128b -emit-llvm %s -o - | FileCheck %s void test() { int v64 __attribute__((__vector_size__(64))); diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index a22a5c11e6ab3..cdc062eee72b1 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -43,6 +43,7 @@ add_llvm_target(HexagonCodeGen HexagonISelDAGToDAGHVX.cpp HexagonISelLowering.cpp HexagonISelLoweringHVX.cpp + HexagonLoopAlign.cpp HexagonLoopIdiomRecognition.cpp HexagonMachineFunctionInfo.cpp HexagonMachineScheduler.cpp diff --git a/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp b/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp new file mode 100644 index 0000000000000..c79b528ff2f3f --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp @@ -0,0 +1,216 @@ +//===----- HexagonLoopAlign.cpp - Generate loop alignment directives -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Inspect each basic block and, if it is a single-basic-block loop with a small +// number of instructions, set the block's alignment to 32 bytes (2^5).
+//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexagon-loop-align" + +#include "HexagonTargetMachine.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/SchedulerRegistry.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +static cl::opt + DisableLoopAlign("disable-hexagon-loop-align", cl::Hidden, + cl::desc("Disable Hexagon loop alignment pass")); + +static cl::opt HVXLoopAlignLimitUB( + "hexagon-hvx-loop-align-limit-ub", cl::Hidden, cl::init(16), + cl::desc("Set hexagon hvx loop upper bound align limit")); + +static cl::opt TinyLoopAlignLimitUB( + "hexagon-tiny-loop-align-limit-ub", cl::Hidden, cl::init(16), + cl::desc("Set hexagon tiny-core loop upper bound align limit")); + +static cl::opt + LoopAlignLimitUB("hexagon-loop-align-limit-ub", cl::Hidden, cl::init(8), + cl::desc("Set hexagon loop upper bound align limit")); + +static cl::opt + LoopAlignLimitLB("hexagon-loop-align-limit-lb", cl::Hidden, cl::init(4), + cl::desc("Set hexagon loop lower bound align limit")); + +static cl::opt + LoopBndlAlignLimit("hexagon-loop-bundle-align-limit", cl::Hidden, + cl::init(4), + cl::desc("Set hexagon loop align bundle limit")); + +static cl::opt TinyLoopBndlAlignLimit( + "hexagon-tiny-loop-bundle-align-limit", cl::Hidden, cl::init(8), + cl::desc("Set hexagon tiny-core loop align bundle limit")); + +static cl::opt + LoopEdgeThreshold("hexagon-loop-edge-threshold", cl::Hidden, cl::init(7500), + cl::desc("Set hexagon loop align edge threshold")); + +namespace llvm { +FunctionPass *createHexagonLoopAlign(); +void initializeHexagonLoopAlignPass(PassRegistry &); +} // namespace llvm + +namespace { + +class HexagonLoopAlign : public MachineFunctionPass { + const HexagonSubtarget *HST = nullptr; + const TargetMachine *HTM = nullptr; + const HexagonInstrInfo *HII = nullptr; + +public: + static char ID; + HexagonLoopAlign() :
MachineFunctionPass(ID) { + initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry()); + } + bool shouldBalignLoop(MachineBasicBlock &BB, bool AboveThres); + bool isSingleLoop(MachineBasicBlock &MBB); + bool attemptToBalignSmallLoop(MachineFunction &MF, MachineBasicBlock &MBB); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Hexagon LoopAlign pass"; } + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +char HexagonLoopAlign::ID = 0; + +bool HexagonLoopAlign::shouldBalignLoop(MachineBasicBlock &BB, + bool AboveThres) { + bool isVec = false; + unsigned InstCnt = 0; + unsigned BndlCnt = 0; + + for (MachineBasicBlock::instr_iterator II = BB.instr_begin(), + IE = BB.instr_end(); + II != IE; ++II) { + + // Stop counting at the endloop instruction; later ones are ignored. + if (HII->isEndLoopN(II->getOpcode())) + break; + // Count bundle headers separately; they are not counted as instructions. + if (II->isBundle()) { + BndlCnt++; + continue; + } + // Skip over debug instructions. + if (II->isDebugInstr()) + continue; + // Record whether the loop contains any HVX instruction. + isVec |= HII->isHVXVec(*II); + // Count the number of instructions. + InstCnt++; + } + + LLVM_DEBUG({ + dbgs() << "Bundle Count : " << BndlCnt << "\n"; + dbgs() << "Instruction Count : " << InstCnt << "\n"; + }); + + unsigned LimitUB = 0; + unsigned LimitBndl = LoopBndlAlignLimit; + // Pick the limits; the conditions are checked in order of priority. + if (HST->isTinyCore()) { + LimitUB = TinyLoopAlignLimitUB; + LimitBndl = TinyLoopBndlAlignLimit; + } else if (isVec) + LimitUB = HVXLoopAlignLimitUB; + else if (AboveThres) + LimitUB = LoopAlignLimitUB; + + // If the upper bound is still zero, none of the conditions above + // applied, i.e. the loop does not meet the criteria.
+ if (LimitUB == 0) + return false; + + return InstCnt >= LoopAlignLimitLB && InstCnt <= LimitUB && + BndlCnt <= LimitBndl; +} + +bool HexagonLoopAlign::isSingleLoop(MachineBasicBlock &MBB) { + int Succs = MBB.succ_size(); + return (MBB.isSuccessor(&MBB) && (Succs == 2)); +} + +bool HexagonLoopAlign::attemptToBalignSmallLoop(MachineFunction &MF, + MachineBasicBlock &MBB) { + if (!isSingleLoop(MBB)) + return false; + + const MachineBranchProbabilityInfo *MBPI = + &getAnalysis(); + const MachineBlockFrequencyInfo *MBFI = + &getAnalysis(); + + // Compute the back-edge frequency: block frequency times edge probability. + BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); + BranchProbability BrProb = MBPI->getEdgeProbability(&MBB, &MBB); + BlockFrequency EdgeFreq = BlockFreq * BrProb; + LLVM_DEBUG({ + dbgs() << "Loop Align Pass:\n"; + dbgs() << "\tedge with freq(" << EdgeFreq.getFrequency() << ")\n"; + }); + + bool AboveThres = EdgeFreq.getFrequency() > LoopEdgeThreshold; + if (shouldBalignLoop(MBB, AboveThres)) { + // The loop qualifies: align the block to 32 bytes (2^5). + MBB.setAlignment(llvm::Align(1 << 5)); + return true; + } + return false; +} + +// Inspect each basic block, and if it is a single-BB loop, see if it +// meets the criteria for increasing its alignment to 32 bytes. + +bool HexagonLoopAlign::runOnMachineFunction(MachineFunction &MF) { + + HST = &MF.getSubtarget(); + HII = HST->getInstrInfo(); + HTM = &MF.getTarget(); + + if (skipFunction(MF.getFunction())) + return false; + if (DisableLoopAlign) + return false; + + // This optimization is performed at + // i) -O2 and above, when the subtarget uses HVX (HST->useHVXOps()).
+ // ii) -O3 when the subtarget does not use HVX. + if (HST->useHVXOps()) { + if (HTM->getOptLevel() < CodeGenOptLevel::Default) + return false; + } else { + if (HTM->getOptLevel() < CodeGenOptLevel::Aggressive) + return false; + } + + bool Changed = false; + for (MachineFunction::iterator MBBi = MF.begin(), MBBe = MF.end(); + MBBi != MBBe; ++MBBi) { + MachineBasicBlock &MBB = *MBBi; + Changed |= attemptToBalignSmallLoop(MF, MBB); + } + return Changed; +} + +} // namespace + +INITIALIZE_PASS(HexagonLoopAlign, "hexagon-loop-align", + "Hexagon LoopAlign pass", false, false) + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createHexagonLoopAlign() { return new HexagonLoopAlign(); } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 7d77286339399..3c346c334d6d3 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -164,6 +164,7 @@ namespace llvm { void initializeHexagonGenMuxPass(PassRegistry&); void initializeHexagonHardwareLoopsPass(PassRegistry&); void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &); + void initializeHexagonLoopAlignPass(PassRegistry &); void initializeHexagonNewValueJumpPass(PassRegistry&); void initializeHexagonOptAddrModePass(PassRegistry&); void initializeHexagonPacketizerPass(PassRegistry&); @@ -194,6 +195,7 @@ namespace llvm { FunctionPass *createHexagonHardwareLoops(); FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, CodeGenOptLevel OptLevel); + FunctionPass *createHexagonLoopAlign(); FunctionPass *createHexagonLoopRescheduling(); FunctionPass *createHexagonNewValueJump(); FunctionPass *createHexagonOptAddrMode(); @@ -256,8 +258,10 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, TT, CPU, FS, Options,
getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), (HexagonNoOpt ? CodeGenOptLevel::None : OL)), - TLOF(std::make_unique()) { + TLOF(std::make_unique()), + Subtarget(Triple(TT), CPU, FS, *this) { initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry()); + initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry()); initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry()); initAsmInfo(); } @@ -476,6 +480,9 @@ void HexagonPassConfig::addPreEmitPass() { // Packetization is mandatory: it handles gather/scatter at all opt levels. addPass(createHexagonPacketizer(NoOpt)); + if (!NoOpt) + addPass(createHexagonLoopAlign()); + if (EnableVectorPrint) addPass(createHexagonVectorPrint()); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h index c5fed0cd65a81..34ff45b6acf34 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -23,6 +23,7 @@ namespace llvm { class HexagonTargetMachine : public LLVMTargetMachine { std::unique_ptr TLOF; + HexagonSubtarget Subtarget; mutable StringMap> SubtargetMap; public: diff --git a/llvm/test/CodeGen/Hexagon/loop-balign.ll b/llvm/test/CodeGen/Hexagon/loop-balign.ll new file mode 100644 index 0000000000000..9d1f42a4b14b1 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/loop-balign.ll @@ -0,0 +1,91 @@ +; RUN: llc -march=hexagon -O3 < %s | FileCheck %s -check-prefix=BALIGN +; BALIGN: .p2align{{.*}}5 + +; The test for checking the alignment of 'for.body4.for.body4_crit_edge' basic block + +define dso_local void @foo(i32 %nCol, i32 %nRow, ptr nocapture %resMat) local_unnamed_addr { +entry: + %shl = shl i32 %nRow, 2 + %cmp36 = icmp sgt i32 %nRow, 0 + %0 = add i32 %nCol, -1 + %.inv = icmp slt i32 %0, 1 + %1 = select i1 %.inv, i32 1, i32 %nCol + br label %Outerloop + +Outerloop: ; preds = %for.end7, %entry + %r12.0 = phi i32 [ 0, %entry ], [ %inc8, %for.end7 ] + %r7_6.0 = phi i64 [ undef, 
%entry ], [ %r7_6.1.lcssa, %for.end7 ] + %r0i.0 = phi i32 [ undef, %entry ], [ %r0i.1.lcssa, %for.end7 ] + %r5.0 = phi ptr [ %resMat, %entry ], [ %r5.1.lcssa, %for.end7 ] + %r8.0 = phi i32 [ %shl, %entry ], [ %r8.1.lcssa, %for.end7 ] + br i1 %cmp36, label %for.body.lr.ph, label %for.end7 + +for.body.lr.ph: ; preds = %Outerloop + %cmp332 = icmp eq i32 %r12.0, 0 + %exitcond.peel = icmp eq i32 %r12.0, 1 + br label %for.body + +for.body: ; preds = %for.end, %for.body.lr.ph + %r8.141 = phi i32 [ %r8.0, %for.body.lr.ph ], [ %add, %for.end ] + %r5.140 = phi ptr [ %r5.0, %for.body.lr.ph ], [ %add.ptr, %for.end ] + %i.039 = phi i32 [ 0, %for.body.lr.ph ], [ %inc6, %for.end ] + %r0i.138 = phi i32 [ %r0i.0, %for.body.lr.ph ], [ %4, %for.end ] + %r7_6.137 = phi i64 [ %r7_6.0, %for.body.lr.ph ], [ %r7_6.2.lcssa, %for.end ] + %add = add nsw i32 %r8.141, %shl + br i1 %cmp332, label %for.end, label %for.body4.peel + +for.body4.peel: ; preds = %for.body + %r1i.0.in.peel = inttoptr i32 %r8.141 to ptr + %r1i.0.peel = load i32, ptr %r1i.0.in.peel, align 4 + %2 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.137, i32 %r1i.0.peel, i32 %r0i.138) + br i1 %exitcond.peel, label %for.end, label %for.body4.preheader.peel.newph + +for.body4.preheader.peel.newph: ; preds = %for.body4.peel + %r1i.0.in = inttoptr i32 %add to ptr + %r1i.0 = load i32, ptr %r1i.0.in, align 4 + br label %for.body4 + +for.body4: ; preds = %for.body4.for.body4_crit_edge, %for.body4.preheader.peel.newph + %inc.phi = phi i32 [ %inc.0, %for.body4.for.body4_crit_edge ], [ 2, %for.body4.preheader.peel.newph ] + %r7_6.233 = phi i64 [ %3, %for.body4.for.body4_crit_edge ], [ %2, %for.body4.preheader.peel.newph ] + %3 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.233, i32 %r1i.0, i32 %r0i.138) + %exitcond = icmp eq i32 %inc.phi, %r12.0 + br i1 %exitcond, label %for.end.loopexit, label %for.body4.for.body4_crit_edge + +for.body4.for.body4_crit_edge: ; preds = %for.body4 + %inc.0 = add nuw nsw i32 %inc.phi, 
1 + br label %for.body4 + +for.end.loopexit: ; preds = %for.body4 + br label %for.end + +for.end: ; preds = %for.end.loopexit, %for.body4.peel, %for.body + %r7_6.2.lcssa = phi i64 [ %r7_6.137, %for.body ], [ %2, %for.body4.peel ], [ %3, %for.end.loopexit ] + %4 = tail call i32 @llvm.hexagon.S2.clbp(i64 %r7_6.2.lcssa) + store i32 %4, ptr %r5.140, align 4 + %add.ptr = getelementptr inbounds i8, ptr %r5.140, i32 undef + %inc6 = add nuw nsw i32 %i.039, 1 + %exitcond47 = icmp eq i32 %inc6, %nRow + br i1 %exitcond47, label %for.end7.loopexit, label %for.body + +for.end7.loopexit: ; preds = %for.end + br label %for.end7 + +for.end7: ; preds = %for.end7.loopexit, %Outerloop + %r7_6.1.lcssa = phi i64 [ %r7_6.0, %Outerloop ], [ %r7_6.2.lcssa, %for.end7.loopexit ] + %r0i.1.lcssa = phi i32 [ %r0i.0, %Outerloop ], [ %4, %for.end7.loopexit ] + %r5.1.lcssa = phi ptr [ %r5.0, %Outerloop ], [ %add.ptr, %for.end7.loopexit ] + %r8.1.lcssa = phi i32 [ %r8.0, %Outerloop ], [ %add, %for.end7.loopexit ] + %inc8 = add nuw i32 %r12.0, 1 + %exitcond48 = icmp eq i32 %inc8, %1 + br i1 %exitcond48, label %if.end, label %Outerloop + +if.end: ; preds = %for.end7 + ret void +} + +; Function Attrs: nounwind readnone +declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32) + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.S2.clbp(i64) diff --git a/llvm/test/CodeGen/Hexagon/loop_align_count.ll b/llvm/test/CodeGen/Hexagon/loop_align_count.ll new file mode 100644 index 0000000000000..07d7e4a8d6117 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/loop_align_count.ll @@ -0,0 +1,116 @@ +; REQUIRES: asserts +; RUN: llc -march=hexagon -mcpu=hexagonv73 -O2 -mattr=+hvxv73,hvx-length64b \ +; RUN: -debug-only=hexagon-loop-align 2>&1 < %s | FileCheck %s +; Validate that there are 4 bundles in the loop.
+ +; CHECK: Loop Align Pass: +; CHECK: Bundle Count : 4 +; CHECK: .p2align{{.*}}5 + +; Function Attrs: nounwind +define void @ham(ptr noalias nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, ptr noalias nocapture %arg4, i32 %arg5) #0 { +bb: + %ashr = ashr i32 %arg3, 2 + %ashr6 = ashr i32 %arg3, 1 + %add = add nsw i32 %ashr6, %ashr + %icmp = icmp sgt i32 %arg2, 0 + br i1 %icmp, label %bb7, label %bb61 + +bb7: ; preds = %bb + %sdiv = sdiv i32 %arg1, 64 + %icmp8 = icmp sgt i32 %arg1, 63 + br label %bb9 + +bb9: ; preds = %bb57, %bb7 + %phi = phi i32 [ 0, %bb7 ], [ %add58, %bb57 ] + %ashr10 = ashr exact i32 %phi, 1 + %mul = mul nsw i32 %ashr10, %arg3 + br i1 %icmp8, label %bb11, label %bb57 + +bb11: ; preds = %bb9 + %add12 = add nsw i32 %phi, 1 + %mul13 = mul nsw i32 %add12, %arg5 + %mul14 = mul nsw i32 %phi, %arg5 + %add15 = add i32 %add, %mul + %add16 = add i32 %mul, %ashr + %add17 = add i32 %mul, %ashr6 + %getelementptr = getelementptr inbounds i8, ptr %arg4, i32 %mul13 + %getelementptr18 = getelementptr inbounds i8, ptr %arg4, i32 %mul14 + %getelementptr19 = getelementptr inbounds i16, ptr %arg, i32 %add15 + %getelementptr20 = getelementptr inbounds i16, ptr %arg, i32 %add16 + %getelementptr21 = getelementptr inbounds i16, ptr %arg, i32 %add17 + %getelementptr22 = getelementptr inbounds i16, ptr %arg, i32 %mul + %bitcast = bitcast ptr %getelementptr to ptr + %bitcast23 = bitcast ptr %getelementptr18 to ptr + %bitcast24 = bitcast ptr %getelementptr19 to ptr + %bitcast25 = bitcast ptr %getelementptr20 to ptr + %bitcast26 = bitcast ptr %getelementptr21 to ptr + %bitcast27 = bitcast ptr %getelementptr22 to ptr + br label %bb28 + +bb28: ; preds = %bb28, %bb11 + %phi29 = phi i32 [ 0, %bb11 ], [ %add54, %bb28 ] + %phi30 = phi ptr [ %bitcast27, %bb11 ], [ %getelementptr36, %bb28 ] + %phi31 = phi ptr [ %bitcast26, %bb11 ], [ %getelementptr37, %bb28 ] + %phi32 = phi ptr [ %bitcast25, %bb11 ], [ %getelementptr39, %bb28 ] + %phi33 = phi ptr [ %bitcast24, %bb11 ], [ 
%getelementptr41, %bb28 ] + %phi34 = phi ptr [ %bitcast, %bb11 ], [ %getelementptr53, %bb28 ] + %phi35 = phi ptr [ %bitcast23, %bb11 ], [ %getelementptr52, %bb28 ] + %getelementptr36 = getelementptr inbounds <16 x i32>, ptr %phi30, i32 1 + %load = load <16 x i32>, ptr %phi30, align 64 + %getelementptr37 = getelementptr inbounds <16 x i32>, ptr %phi31, i32 1 + %load38 = load <16 x i32>, ptr %phi31, align 64 + %getelementptr39 = getelementptr inbounds <16 x i32>, ptr %phi32, i32 1 + %load40 = load <16 x i32>, ptr %phi32, align 64 + %getelementptr41 = getelementptr inbounds <16 x i32>, ptr %phi33, i32 1 + %load42 = load <16 x i32>, ptr %phi33, align 64 + %call = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load, <16 x i32> %load38) + %call43 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load, <16 x i32> %load38) + %call44 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load40, <16 x i32> %load42) + %call45 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load40, <16 x i32> %load42) + %call46 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call, <16 x i32> %call44) + %call47 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call, <16 x i32> %call44) + %call48 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call43, <16 x i32> %call45) + %call49 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call43, <16 x i32> %call45) + %call50 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call47, <16 x i32> %call46) + %call51 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call49, <16 x i32> %call48) + %getelementptr52 = getelementptr inbounds <16 x i32>, ptr %phi35, i32 1 + store <16 x i32> %call50, ptr %phi35, align 64 + %getelementptr53 = getelementptr inbounds <16 x i32>, ptr %phi34, i32 1 + store <16 x i32> %call51, ptr %phi34, align 64 + %add54 = add nsw i32 %phi29, 1 + %icmp55 = icmp slt i32 %add54, %sdiv + br i1 %icmp55, label %bb28, label %bb56 + +bb56: ; preds = %bb28 
+ br label %bb57 + +bb57: ; preds = %bb56, %bb9 + %add58 = add nsw i32 %phi, 2 + %icmp59 = icmp slt i32 %add58, %arg2 + br i1 %icmp59, label %bb9, label %bb60 + +bb60: ; preds = %bb57 + br label %bb61 + +bb61: ; preds = %bb60, %bb + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/Hexagon/loop_align_count.mir b/llvm/test/CodeGen/Hexagon/loop_align_count.mir new file mode 100644 index 0000000000000..afbd917f4f0db --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/loop_align_count.mir @@ -0,0 +1,131 @@ +# REQUIRES: asserts +# RUN: llc -march=hexagon -O3 -run-pass hexagon-loop-align -o - %s\ +# RUN: -debug-only=hexagon-loop-align -verify-machineinstrs 2>&1 | FileCheck %s + +# Test that we only count until the endloop instruction and that we align this +# loop to 32.
+# CHECK: Loop Align Pass: +# CHECK: Instruction Count : 16 +# CHECK: bb.5 (align 32) +--- +name: fred +tracksRegLiveness: true + +body: | + bb.0: + successors: %bb.1(0x50000000), %bb.8(0x30000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r5 + + renamable $p0 = C2_cmpgti renamable $r2, 0 + J2_jumpf killed renamable $p0, %bb.8, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r5 + + renamable $r7 = A2_addi killed renamable $r2, 1 + renamable $r8 = S2_asr_i_r renamable $r1, 31 + renamable $p0 = C2_cmpgti renamable $r1, 63 + renamable $r2 = S2_asr_i_r renamable $r3, 2 + renamable $r6 = S2_asr_i_r renamable $r3, 1 + renamable $r9 = S2_lsr_i_r killed renamable $r7, 1 + renamable $r1 = S2_lsr_i_r_acc killed renamable $r1, killed renamable $r8, 26 + renamable $r7 = A2_tfrsi 0 + renamable $r1 = S2_asr_i_r killed renamable $r1, 6 + J2_loop1r %bb.2, killed renamable $r9, implicit-def $lc1, implicit-def $sa1 + renamable $r8 = nsw A2_add renamable $r6, renamable $r2 + + bb.2: + successors: %bb.3(0x40000000), %bb.7(0x40000000) + liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8 + + J2_jumpf renamable $p0, %bb.7, implicit-def dead $pc + J2_jump %bb.3, implicit-def dead $pc + + bb.3: + successors: %bb.4(0x80000000) + liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8 + + renamable $r13 = exact S2_asr_i_r renamable $r7, 1 + renamable $r12 = COPY renamable $r4 + renamable $r9 = COPY renamable $r4 + renamable $r14 = nsw A2_addi renamable $r7, 1 + renamable $r15 = nsw M2_mpyi killed renamable $r13, renamable $r3 + renamable $r9 = M2_maci killed renamable $r9, killed renamable $r14, renamable $r5 + renamable $r13 = A2_add renamable $r8, renamable $r15 + renamable $r28 = A2_add renamable $r15, renamable $r2 + renamable $r10 = A2_add renamable $r15, renamable $r6 + renamable $r12 = M2_maci killed renamable $r12, renamable $r7, renamable $r5 + renamable $r13 = S2_addasl_rrri renamable 
$r0, killed renamable $r13, 1 + renamable $r14 = S2_addasl_rrri renamable $r0, killed renamable $r15, 1 + renamable $r15 = S2_addasl_rrri renamable $r0, killed renamable $r28, 1 + renamable $r28 = S2_addasl_rrri renamable $r0, killed renamable $r10, 1 + + bb.4: + successors: %bb.5(0x40000000), %bb.6(0x40000000) + liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r13, $r14, $r15, $r28 + + renamable $v0, renamable $r14 = V6_vL32b_pi killed renamable $r14, 64 + renamable $p1 = C2_cmpgtui renamable $r1, 1 + renamable $r10 = A2_addi renamable $r1, -1 + renamable $v2, renamable $r28 = V6_vL32b_pi killed renamable $r28, 64 + renamable $v1 = V6_vaddh renamable $v0, renamable $v2 + renamable $v3, renamable $r15 = V6_vL32b_pi killed renamable $r15, 64 + renamable $v0 = V6_vsubh killed renamable $v0, killed renamable $v2 + J2_loop0r %bb.5, killed renamable $r10, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + renamable $v4, renamable $r13 = V6_vL32b_pi killed renamable $r13, 64 + renamable $v2 = V6_vaddh renamable $v3, renamable $v4 + J2_jumpf killed renamable $p1, %bb.6, implicit-def $pc + J2_jump %bb.5, implicit-def $pc + + bb.5: + successors: %bb.5(0x7c000000), %bb.6(0x04000000) + liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r13, $r14, $r15, $r28, $v0, $v1, $v2, $v3, $v4 + + renamable $v3 = V6_vsubh killed renamable $v3, killed renamable $v4 + renamable $v4, renamable $r14 = V6_vL32b_pi killed renamable $r14, 64 + renamable $v5 = V6_vnavgh renamable $v1, renamable $v2 + renamable $v1 = V6_vavgh killed renamable $v1, killed renamable $v2 + renamable $v2, renamable $r28 = V6_vL32b_pi killed renamable $r28, 64 + renamable $v1 = V6_vsathub killed renamable $v5, killed renamable $v1 + renamable $v5 = V6_vnavgh renamable $v0, renamable $v3 + renamable $v6 = V6_vavgh killed renamable $v0, killed renamable $v3 + renamable $r12 = V6_vS32b_pi killed renamable $r12, 64, killed renamable $v1 + renamable $v1 = V6_vaddh renamable 
$v4, renamable $v2 + renamable $v3, renamable $r15 = V6_vL32b_pi killed renamable $r15, 64 + renamable $v0 = V6_vsubh killed renamable $v4, killed renamable $v2 + renamable $v4, renamable $r13 = V6_vL32b_pi killed renamable $r13, 64 + renamable $v2 = V6_vaddh renamable $v3, renamable $v4 + renamable $v5 = V6_vsathub killed renamable $v5, killed renamable $v6 + renamable $r9 = V6_vS32b_pi killed renamable $r9, 64, killed renamable $v5 + ENDLOOP0 %bb.5, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.6, implicit-def $pc + + bb.6: + successors: %bb.7(0x80000000) + liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $v0, $v1, $v2, $v3, $v4 + + renamable $v3 = V6_vsubh killed renamable $v3, killed renamable $v4 + renamable $v4 = V6_vavgh renamable $v1, renamable $v2 + renamable $v1 = V6_vnavgh killed renamable $v1, killed renamable $v2 + renamable $v2 = V6_vavgh renamable $v0, renamable $v3 + renamable $v0 = V6_vnavgh killed renamable $v0, killed renamable $v3 + renamable $v1 = V6_vsathub killed renamable $v1, killed renamable $v4 + dead renamable $r12 = V6_vS32b_pi killed renamable $r12, 64, killed renamable $v1 + renamable $v0 = V6_vsathub killed renamable $v0, killed renamable $v2 + dead renamable $r9 = V6_vS32b_pi killed renamable $r9, 64, killed renamable $v0 + J2_jump %bb.7, implicit-def $pc + + bb.7: + successors: %bb.2(0x7c000000), %bb.8(0x04000000) + liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8 + + renamable $r7 = nsw A2_addi killed renamable $r7, 2 + ENDLOOP1 %bb.2, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1 + J2_jump %bb.8, implicit-def dead $pc + + bb.8: + PS_jmpret $r31, implicit-def dead $pc + +... 
diff --git a/llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll b/llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll new file mode 100644 index 0000000000000..6b3c0a94a494d --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll @@ -0,0 +1,117 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv73 -O2 -mattr=+hvxv73,hvx-length64b < %s | FileCheck %s +; CHECK: .p2align{{.*}}5 + +; Function Attrs: nounwind +define void @wobble(ptr noalias nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, ptr noalias nocapture %arg4, i32 %arg5) #0 { +bb: + %ashr = ashr i32 %arg3, 2 + %ashr6 = ashr i32 %arg3, 1 + %add = add nsw i32 %ashr6, %ashr + %icmp = icmp sgt i32 %arg2, 0 + br i1 %icmp, label %bb7, label %bb61 + +bb7: ; preds = %bb + %sdiv = sdiv i32 %arg1, 64 + %icmp8 = icmp sgt i32 %arg1, 63 + br label %bb9 + +bb9: ; preds = %bb57, %bb7 + %phi = phi i32 [ 0, %bb7 ], [ %add58, %bb57 ] + %ashr10 = ashr exact i32 %phi, 1 + %mul = mul nsw i32 %ashr10, %arg3 + br i1 %icmp8, label %bb11, label %bb57 + +bb11: ; preds = %bb9 + %add12 = add nsw i32 %phi, 1 + %mul13 = mul nsw i32 %add12, %arg5 + %mul14 = mul nsw i32 %phi, %arg5 + %add15 = add i32 %add, %mul + %add16 = add i32 %mul, %ashr + %add17 = add i32 %mul, %ashr6 + %getelementptr = getelementptr inbounds i8, ptr %arg4, i32 %mul13 + %getelementptr18 = getelementptr inbounds i8, ptr %arg4, i32 %mul14 + %getelementptr19 = getelementptr inbounds i16, ptr %arg, i32 %add15 + %getelementptr20 = getelementptr inbounds i16, ptr %arg, i32 %add16 + %getelementptr21 = getelementptr inbounds i16, ptr %arg, i32 %add17 + %getelementptr22 = getelementptr inbounds i16, ptr %arg, i32 %mul + %bitcast = bitcast ptr %getelementptr to ptr + %bitcast23 = bitcast ptr %getelementptr18 to ptr + %bitcast24 = bitcast ptr %getelementptr19 to ptr + %bitcast25 = bitcast ptr %getelementptr20 to ptr + %bitcast26 = bitcast ptr %getelementptr21 to ptr + %bitcast27 = bitcast ptr %getelementptr22 to ptr + br label %bb28 + +bb28: ; preds = %bb28, %bb11 + %phi29 = phi 
i32 [ 0, %bb11 ], [ %add54, %bb28 ] + %phi30 = phi ptr [ %bitcast27, %bb11 ], [ %getelementptr36, %bb28 ] + %phi31 = phi ptr [ %bitcast26, %bb11 ], [ %getelementptr37, %bb28 ] + %phi32 = phi ptr [ %bitcast25, %bb11 ], [ %getelementptr39, %bb28 ] + %phi33 = phi ptr [ %bitcast24, %bb11 ], [ %getelementptr41, %bb28 ] + %phi34 = phi ptr [ %bitcast, %bb11 ], [ %getelementptr53, %bb28 ] + %phi35 = phi ptr [ %bitcast23, %bb11 ], [ %getelementptr52, %bb28 ] + %getelementptr36 = getelementptr inbounds <16 x i32>, ptr %phi30, i32 1 + %load = load <16 x i32>, ptr %phi30, align 64, !tbaa !1 + %getelementptr37 = getelementptr inbounds <16 x i32>, ptr %phi31, i32 1 + %load38 = load <16 x i32>, ptr %phi31, align 64, !tbaa !1 + %getelementptr39 = getelementptr inbounds <16 x i32>, ptr %phi32, i32 1 + %load40 = load <16 x i32>, ptr %phi32, align 64, !tbaa !1 + %getelementptr41 = getelementptr inbounds <16 x i32>, ptr %phi33, i32 1 + %load42 = load <16 x i32>, ptr %phi33, align 64, !tbaa !1 + %call = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load, <16 x i32> %load38) + %call43 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load, <16 x i32> %load38) + %call44 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load40, <16 x i32> %load42) + %call45 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load40, <16 x i32> %load42) + %call46 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call, <16 x i32> %call44) + %call47 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call, <16 x i32> %call44) + %call48 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call43, <16 x i32> %call45) + %call49 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call43, <16 x i32> %call45) + %call50 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call47, <16 x i32> %call46) + %call51 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call49, <16 x i32> %call48) + %getelementptr52 = getelementptr inbounds <16 x i32>, 
ptr %phi35, i32 1 + store <16 x i32> %call50, ptr %phi35, align 64, !tbaa !1 + %getelementptr53 = getelementptr inbounds <16 x i32>, ptr %phi34, i32 1 + store <16 x i32> %call51, ptr %phi34, align 64, !tbaa !1 + %add54 = add nsw i32 %phi29, 1 + %icmp55 = icmp slt i32 %add54, %sdiv + br i1 %icmp55, label %bb28, label %bb56 + +bb56: ; preds = %bb28 + br label %bb57 + +bb57: ; preds = %bb56, %bb9 + %add58 = add nsw i32 %phi, 2 + %icmp59 = icmp slt i32 %add58, %arg2 + br i1 %icmp59, label %bb9, label %bb60 + +bb60: ; preds = %bb57 + br label %bb61 + +bb61: ; preds = %bb60, %bb + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32>, <16 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } + +!llvm.ident = !{!0} + +!0 = !{!"Clang 3.1"} +!1 = !{!2, !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"}