35 changes: 35 additions & 0 deletions compiler-rt/test/nsan/lit.cfg.py
@@ -1 +1,36 @@
config.name = "NSan" + config.name_suffix

# Setup source root.
config.test_source_root = os.path.dirname(__file__)

# Test suffixes.
config.suffixes = [".c", ".cpp", ".test"]

# C & CXX flags.
c_flags = [config.target_cflags]

# CXX flags
cxx_flags = c_flags + config.cxx_mode_flags + ["-std=c++17"]

nsan_flags = [
"-fsanitize=numerical",
"-g",
"-mno-omit-leaf-frame-pointer",
"-fno-omit-frame-pointer",
]


def build_invocation(compile_flags):
return " " + " ".join([config.clang] + compile_flags) + " "


# Add substitutions.
config.substitutions.append(("%clang ", build_invocation(c_flags)))
config.substitutions.append(("%clang_nsan ", build_invocation(c_flags + nsan_flags)))
config.substitutions.append(
("%clangxx_nsan ", build_invocation(cxx_flags + nsan_flags))
)

# NSan tests are currently supported on Linux only.
if config.host_os not in ["Linux"]:
config.unsupported = True
2 changes: 0 additions & 2 deletions compiler-rt/test/nsan/lit.site.cfg.py.in
@@ -4,8 +4,6 @@
config.name_suffix = "-@CONFIG_NAME@"
config.target_cflags = "@NSAN_TEST_TARGET_CFLAGS@"
config.target_arch = "@NSAN_TEST_TARGET_ARCH@"
config.use_lld = @NSAN_TEST_USE_LLD@
config.use_thinlto = @NSAN_TEST_USE_THINLTO@

# Load common config for all compiler-rt lit tests.
lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
70 changes: 70 additions & 0 deletions compiler-rt/test/nsan/sum.cpp
@@ -0,0 +1,70 @@
// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=NaiveSum -DFLT=float %s -o %t
// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 not %run %t 2>&1 | FileCheck %s

// RUN: %clangxx_nsan -O3 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=NaiveSum -DFLT=float %s -o %t
// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 not %run %t 2>&1 | FileCheck %s

// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=KahanSum -DFLT=float %s -o %t
// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 %run %t

// RUN: %clangxx_nsan -O3 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=KahanSum -DFLT=float %s -o %t
// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 %run %t

#include <chrono>
#include <cstdio>
#include <iostream>
#include <random>
#include <vector>

// A naive, unstable summation.
template <typename T>
__attribute__((noinline)) // To check call stack reporting.
T NaiveSum(const std::vector<T>& values) {
T sum = 0;
for (T v : values) {
sum += v;
}
return sum;
// CHECK: WARNING: NumericalStabilitySanitizer: inconsistent shadow results while checking return
// CHECK: float{{ *}}precision (native):
// CHECK: double{{ *}}precision (shadow):
// CHECK: {{#0 .*in .* NaiveSum}}
}

// Kahan's summation is a numerically stable sum.
// https://en.wikipedia.org/wiki/Kahan_summation_algorithm
template <typename T>
__attribute__((noinline)) T KahanSum(const std::vector<T> &values) {
T sum = 0;
T c = 0;
for (T v : values) {
T y = v - c;
T t = sum + y;
c = (t - sum) - y;
sum = t;
}
return sum;
}

int main() {
std::vector<FLT> values;
constexpr int kNumValues = 1000000;
values.reserve(kNumValues);
// Using a seed to avoid flakiness.
constexpr uint32_t kSeed = 0x123456;
std::mt19937 gen(kSeed);
std::uniform_real_distribution<FLT> dis(0.0f, 1000.0f);
for (int i = 0; i < kNumValues; ++i) {
values.push_back(dis(gen));
}

const auto t1 = std::chrono::high_resolution_clock::now();
const auto sum = SUM(values);
const auto t2 = std::chrono::high_resolution_clock::now();
printf("sum: %.8f\n", sum);
std::cout << "runtime: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1)
.count() /
1000.0
<< "ms\n";
return 0;
}
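For intuition about what this test exercises: single-precision accumulation silently drops low-order bits long before the double-precision shadow does, which is exactly the discrepancy NSan reports for NaiveSum but not for KahanSum. A minimal standalone sketch of that effect, independent of the test harness and of NSan itself:

#include <cstdio>

int main() {
  // 2^24 is the first integer whose successor is not representable as a
  // float, so each += 1.0f below rounds straight back to 2^24.
  float f = 16777216.0f;
  double d = 16777216.0;
  for (int i = 0; i < 100; ++i) {
    f += 1.0f;
    d += 1.0;
  }
  std::printf("float:  %.1f\n", f); // 16777216.0 -- all updates were lost
  std::printf("double: %.1f\n", d); // 16777316.0
  return 0;
}

The naive float sum above drifts from its double shadow in the same way, while Kahan's compensation term keeps the two consistent within the configured relative-error bound.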
12 changes: 0 additions & 12 deletions lld/ELF/ScriptLexer.cpp
@@ -279,18 +279,6 @@ bool ScriptLexer::consume(StringRef tok) {
return false;
}

// Consumes Tok followed by ":". Space is allowed between Tok and ":".
bool ScriptLexer::consumeLabel(StringRef tok) {
if (consume((tok + ":").str()))
return true;
if (tokens.size() >= pos + 2 && tokens[pos] == tok &&
tokens[pos + 1] == ":") {
pos += 2;
return true;
}
return false;
}

void ScriptLexer::skip() { (void)next(); }

void ScriptLexer::expect(StringRef expect) {
1 change: 0 additions & 1 deletion lld/ELF/ScriptLexer.h
@@ -29,7 +29,6 @@ class ScriptLexer {
void skip();
bool consume(StringRef tok);
void expect(StringRef expect);
bool consumeLabel(StringRef tok);
std::string getCurrentLocation();
MemoryBufferRef getCurrentMB();

16 changes: 8 additions & 8 deletions lld/ELF/ScriptParser.cpp
@@ -1719,20 +1719,20 @@ ScriptParser::readSymbols() {
while (!errorCount()) {
if (consume("}"))
break;
if (consumeLabel("local")) {
v = &locals;
continue;
}
if (consumeLabel("global")) {
v = &globals;
continue;
}

if (consume("extern")) {
SmallVector<SymbolVersion, 0> ext = readVersionExtern();
v->insert(v->end(), ext.begin(), ext.end());
} else {
StringRef tok = next();
if (tok == "local:" || (tok == "local" && consume(":"))) {
v = &locals;
continue;
}
if (tok == "global:" || (tok == "global" && consume(":"))) {
v = &globals;
continue;
}
v->push_back({unquote(tok), false, hasWildcard(tok)});
}
expect(";");
14 changes: 0 additions & 14 deletions llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1710,20 +1710,6 @@ let TargetPrefix = "riscv" in {
defm vsuxseg # nf : RISCVISegStore<nf>;
}

// Strided loads/stores for fixed vectors.
def int_riscv_masked_strided_load
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyptr_ty,
llvm_anyint_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
[NoCapture<ArgIndex<1>>, IntrReadMem]>;
def int_riscv_masked_strided_store
: DefaultAttrsIntrinsic<[],
[llvm_anyvector_ty, llvm_anyptr_ty,
llvm_anyint_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
[NoCapture<ArgIndex<1>>, IntrWriteMem]>;

// Segment loads/stores for fixed vectors.
foreach nf = [2, 3, 4, 5, 6, 7, 8] in {
def int_riscv_seg # nf # _load
1 change: 1 addition & 0 deletions llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -205,6 +205,7 @@ class LibCallSimplifier {
Value *optimizeSinCosPi(CallInst *CI, bool IsSin, IRBuilderBase &B);
Value *optimizeTrigInversionPairs(CallInst *CI, IRBuilderBase &B);
Value *optimizeSymmetric(CallInst *CI, LibFunc Func, IRBuilderBase &B);
Value *optimizeRemquo(CallInst *CI, IRBuilderBase &B);
// Wrapper for all floating point library call optimizations
Value *optimizeFloatingPointLibCall(CallInst *CI, LibFunc Func,
IRBuilderBase &B);
1 change: 1 addition & 0 deletions llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMGlobalISel
GlobalISel.cpp
Combiner.cpp
CombinerHelper.cpp
CombinerHelperCasts.cpp
CombinerHelperVectorOps.cpp
GIMatchTableExecutor.cpp
GISelChangeObserver.cpp
89 changes: 0 additions & 89 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7457,92 +7457,3 @@ void CombinerHelper::applyExpandFPowI(MachineInstr &MI, int64_t Exponent) {
Builder.buildCopy(Dst, *Res);
MI.eraseFromParent();
}

bool CombinerHelper::matchSextOfTrunc(const MachineOperand &MO,
BuildFnTy &MatchInfo) {
GSext *Sext = cast<GSext>(getDefIgnoringCopies(MO.getReg(), MRI));
GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Sext->getSrcReg(), MRI));

Register Dst = Sext->getReg(0);
Register Src = Trunc->getSrcReg();

LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);

if (DstTy == SrcTy) {
MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
return true;
}

if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
MatchInfo = [=](MachineIRBuilder &B) {
B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoSWrap);
};
return true;
}

if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}})) {
MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
return true;
}

return false;
}

bool CombinerHelper::matchZextOfTrunc(const MachineOperand &MO,
BuildFnTy &MatchInfo) {
GZext *Zext = cast<GZext>(getDefIgnoringCopies(MO.getReg(), MRI));
GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Zext->getSrcReg(), MRI));

Register Dst = Zext->getReg(0);
Register Src = Trunc->getSrcReg();

LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);

if (DstTy == SrcTy) {
MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
return true;
}

if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
MatchInfo = [=](MachineIRBuilder &B) {
B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoUWrap);
};
return true;
}

if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) {
MatchInfo = [=](MachineIRBuilder &B) {
B.buildZExt(Dst, Src, MachineInstr::MIFlag::NonNeg);
};
return true;
}

return false;
}

bool CombinerHelper::matchNonNegZext(const MachineOperand &MO,
BuildFnTy &MatchInfo) {
GZext *Zext = cast<GZext>(MRI.getVRegDef(MO.getReg()));

Register Dst = Zext->getReg(0);
Register Src = Zext->getSrcReg();

LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);
const auto &TLI = getTargetLowering();

// Convert zext nneg to sext if sext is the preferred form for the target.
if (isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}}) &&
TLI.isSExtCheaperThanZExt(getMVTForLLT(SrcTy), getMVTForLLT(DstTy))) {
MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
return true;
}

return false;
}
115 changes: 115 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -0,0 +1,115 @@
//===- CombinerHelperCasts.cpp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements CombinerHelper for G_ANYEXT, G_SEXT, G_TRUNC, and
// G_ZEXT
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/Support/Casting.h"

#define DEBUG_TYPE "gi-combiner"

using namespace llvm;

bool CombinerHelper::matchSextOfTrunc(const MachineOperand &MO,
BuildFnTy &MatchInfo) {
GSext *Sext = cast<GSext>(getDefIgnoringCopies(MO.getReg(), MRI));
GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Sext->getSrcReg(), MRI));

Register Dst = Sext->getReg(0);
Register Src = Trunc->getSrcReg();

LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);

if (DstTy == SrcTy) {
MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
return true;
}

if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
MatchInfo = [=](MachineIRBuilder &B) {
B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoSWrap);
};
return true;
}

if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}})) {
MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
return true;
}

return false;
}

bool CombinerHelper::matchZextOfTrunc(const MachineOperand &MO,
BuildFnTy &MatchInfo) {
GZext *Zext = cast<GZext>(getDefIgnoringCopies(MO.getReg(), MRI));
GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Zext->getSrcReg(), MRI));

Register Dst = Zext->getReg(0);
Register Src = Trunc->getSrcReg();

LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);

if (DstTy == SrcTy) {
MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
return true;
}

if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
MatchInfo = [=](MachineIRBuilder &B) {
B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoUWrap);
};
return true;
}

if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) {
MatchInfo = [=](MachineIRBuilder &B) {
B.buildZExt(Dst, Src, MachineInstr::MIFlag::NonNeg);
};
return true;
}

return false;
}

bool CombinerHelper::matchNonNegZext(const MachineOperand &MO,
BuildFnTy &MatchInfo) {
GZext *Zext = cast<GZext>(MRI.getVRegDef(MO.getReg()));

Register Dst = Zext->getReg(0);
Register Src = Zext->getSrcReg();

LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);
const auto &TLI = getTargetLowering();

// Convert zext nneg to sext if sext is the preferred form for the target.
if (isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}}) &&
TLI.isSExtCheaperThanZExt(getMVTForLLT(SrcTy), getMVTForLLT(DstTy))) {
MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
return true;
}

return false;
}
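The scalar intuition behind these cast combines, sketched in plain C++ rather than gMIR (an illustration under the assumption that the truncation drops no significant bits; it is not code from this patch):

#include <cassert>
#include <cstdint>

int main() {
  int64_t x = -42;                      // known to fit in the narrower type
  int32_t t = static_cast<int32_t>(x);  // plays the role of G_TRUNC
  int64_t s = static_cast<int64_t>(t);  // plays the role of G_SEXT
  // When the truncate discarded no significant bits, sign-extending the
  // truncated value reproduces the original, so the sext(trunc(x)) pair can
  // be folded away (or reduced to a single trunc/sext when the widths differ).
  assert(s == x);
  return 0;
}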
12 changes: 10 additions & 2 deletions llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -528,8 +528,16 @@ bool MachinePipeliner::useSwingModuloScheduler() {
}

bool MachinePipeliner::useWindowScheduler(bool Changed) {
// WindowScheduler does not work when it is off or when SwingModuloScheduler
// is successfully scheduled.
// WindowScheduler does not work in the following cases:
// 1. when it is off.
// 2. when SwingModuloScheduler is successfully scheduled.
// 3. when the II is set by pragma.
if (II_setByPragma) {
LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
"llvm.loop.pipeline.initiationinterval is set.\n");
return false;
}

return WindowSchedulingOption == WindowSchedulingFlag::WS_Force ||
(WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed);
}
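For reference, case 3 corresponds to loops whose initiation interval is pinned from the source. A hedged sketch of how that typically looks on the C/C++ side (clang's loop pragma, which is lowered to the llvm.loop.pipeline.initiationinterval metadata checked above; the function and its body are illustrative only):

void saxpy(float *x, const float *y, float a, int n) {
  // Request II = 2 from software pipelining for this loop.
#pragma clang loop pipeline_initiation_interval(2)
  for (int i = 0; i < n; ++i)
    x[i] = a * x[i] + y[i];
}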
21 changes: 14 additions & 7 deletions llvm/lib/CodeGen/WindowScheduler.cpp
@@ -232,8 +232,11 @@ bool WindowScheduler::initialize() {
return false;
}
for (auto &Def : MI.all_defs())
if (Def.isReg() && Def.getReg().isPhysical())
if (Def.isReg() && Def.getReg().isPhysical()) {
LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
"window scheduling!\n");
return false;
}
}
if (SchedInstrNum <= WindowRegionLimit) {
LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
@@ -437,12 +440,16 @@ int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
int PredCycle = getOriCycle(PredMI);
ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
}
// ResourceManager can be used to detect resource conflicts between the
// current MI and the previously inserted MIs.
while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
++CurCycle;
if (CurCycle == (int)WindowIILimit)
return CurCycle;
// Zero-cost instructions do not need to check resources.
if (!TII->isZeroCost(MI.getOpcode())) {
// ResourceManager can be used to detect resource conflicts between the
// current MI and the previously inserted MIs.
while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
++CurCycle;
if (CurCycle == (int)WindowIILimit)
return CurCycle;
}
RM.reserveResources(*SU, CurCycle);
}
RM.reserveResources(*SU, CurCycle);
OriToCycle[getOriMI(&MI)] = CurCycle;
1 change: 0 additions & 1 deletion llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -1144,7 +1144,6 @@ def : PatGprGpr<urem, MOD_DU>;
def : PatGprGpr<loongarch_mod_wu, MOD_WU>;
def : PatGprGpr<rotr, ROTR_D>;
def : PatGprGpr<loongarch_rotr_w, ROTR_W>;
def : PatGprGpr_32<rotr, ROTR_W>;
def : PatGprImm<rotr, ROTRI_D, uimm6>;
def : PatGprImm_32<rotr, ROTRI_W, uimm5>;
def : PatGprImm<loongarch_rotr_w, ROTRI_W, uimm5>;
18 changes: 12 additions & 6 deletions llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -515,17 +515,23 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II,

Builder.SetInsertPoint(II);

Value *EVL = Builder.CreateElementCount(
IntegerType::get(Ctx, 32), cast<VectorType>(DataType)->getElementCount());

CallInst *Call;
if (II->getIntrinsicID() == Intrinsic::masked_gather)
if (II->getIntrinsicID() == Intrinsic::masked_gather) {
Call = Builder.CreateIntrinsic(
Intrinsic::riscv_masked_strided_load,
Intrinsic::experimental_vp_strided_load,
{DataType, BasePtr->getType(), Stride->getType()},
{II->getArgOperand(3), BasePtr, Stride, II->getArgOperand(2)});
else
{BasePtr, Stride, II->getArgOperand(2), EVL});
Call = Builder.CreateIntrinsic(
Intrinsic::vp_select, {DataType},
{II->getOperand(2), Call, II->getArgOperand(3), EVL});
} else
Call = Builder.CreateIntrinsic(
Intrinsic::riscv_masked_strided_store,
Intrinsic::experimental_vp_strided_store,
{DataType, BasePtr->getType(), Stride->getType()},
{II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3)});
{II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3), EVL});

Call->takeName(II);
II->replaceAllUsesWith(Call);
159 changes: 0 additions & 159 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1622,12 +1622,6 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
MachineMemOperand::MOVolatile;
return true;
case Intrinsic::riscv_masked_strided_load:
return SetRVVLoadStoreInfo(/*PtrOp*/ 1, /*IsStore*/ false,
/*IsUnitStrided*/ false);
case Intrinsic::riscv_masked_strided_store:
return SetRVVLoadStoreInfo(/*PtrOp*/ 1, /*IsStore*/ true,
/*IsUnitStrided*/ false);
case Intrinsic::riscv_seg2_load:
case Intrinsic::riscv_seg3_load:
case Intrinsic::riscv_seg4_load:
@@ -9414,81 +9408,6 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
switch (IntNo) {
default:
break;
case Intrinsic::riscv_masked_strided_load: {
SDLoc DL(Op);
MVT XLenVT = Subtarget.getXLenVT();

// If the mask is known to be all ones, optimize to an unmasked intrinsic;
// the selection of the masked intrinsics doesn't do this for us.
SDValue Mask = Op.getOperand(5);
bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());

MVT VT = Op->getSimpleValueType(0);
MVT ContainerVT = VT;
if (VT.isFixedLengthVector())
ContainerVT = getContainerForFixedLengthVector(VT);

SDValue PassThru = Op.getOperand(2);
if (!IsUnmasked) {
MVT MaskVT = getMaskTypeFor(ContainerVT);
if (VT.isFixedLengthVector()) {
Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
}
}

auto *Load = cast<MemIntrinsicSDNode>(Op);
SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
SDValue Ptr = Op.getOperand(3);
SDValue Stride = Op.getOperand(4);
SDValue Result, Chain;

// TODO: We restrict this to unmasked loads currently in consideration of
// the complexity of handling all falses masks.
MVT ScalarVT = ContainerVT.getVectorElementType();
if (IsUnmasked && isNullConstant(Stride) && ContainerVT.isInteger()) {
SDValue ScalarLoad =
DAG.getExtLoad(ISD::EXTLOAD, DL, XLenVT, Load->getChain(), Ptr,
ScalarVT, Load->getMemOperand());
Chain = ScalarLoad.getValue(1);
Result = lowerScalarSplat(SDValue(), ScalarLoad, VL, ContainerVT, DL, DAG,
Subtarget);
} else if (IsUnmasked && isNullConstant(Stride) && isTypeLegal(ScalarVT)) {
SDValue ScalarLoad = DAG.getLoad(ScalarVT, DL, Load->getChain(), Ptr,
Load->getMemOperand());
Chain = ScalarLoad.getValue(1);
Result = DAG.getSplat(ContainerVT, DL, ScalarLoad);
} else {
SDValue IntID = DAG.getTargetConstant(
IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
XLenVT);

SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
if (IsUnmasked)
Ops.push_back(DAG.getUNDEF(ContainerVT));
else
Ops.push_back(PassThru);
Ops.push_back(Ptr);
Ops.push_back(Stride);
if (!IsUnmasked)
Ops.push_back(Mask);
Ops.push_back(VL);
if (!IsUnmasked) {
SDValue Policy =
DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
Ops.push_back(Policy);
}

SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
Result =
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
Load->getMemoryVT(), Load->getMemOperand());
Chain = Result.getValue(1);
}
if (VT.isFixedLengthVector())
Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
return DAG.getMergeValues({Result, Chain}, DL);
}
case Intrinsic::riscv_seg2_load:
case Intrinsic::riscv_seg3_load:
case Intrinsic::riscv_seg4_load:
@@ -9568,47 +9487,6 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
switch (IntNo) {
default:
break;
case Intrinsic::riscv_masked_strided_store: {
SDLoc DL(Op);
MVT XLenVT = Subtarget.getXLenVT();

// If the mask is known to be all ones, optimize to an unmasked intrinsic;
// the selection of the masked intrinsics doesn't do this for us.
SDValue Mask = Op.getOperand(5);
bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());

SDValue Val = Op.getOperand(2);
MVT VT = Val.getSimpleValueType();
MVT ContainerVT = VT;
if (VT.isFixedLengthVector()) {
ContainerVT = getContainerForFixedLengthVector(VT);
Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget);
}
if (!IsUnmasked) {
MVT MaskVT = getMaskTypeFor(ContainerVT);
if (VT.isFixedLengthVector())
Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
}

SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;

SDValue IntID = DAG.getTargetConstant(
IsUnmasked ? Intrinsic::riscv_vsse : Intrinsic::riscv_vsse_mask, DL,
XLenVT);

auto *Store = cast<MemIntrinsicSDNode>(Op);
SmallVector<SDValue, 8> Ops{Store->getChain(), IntID};
Ops.push_back(Val);
Ops.push_back(Op.getOperand(3)); // Ptr
Ops.push_back(Op.getOperand(4)); // Stride
if (!IsUnmasked)
Ops.push_back(Mask);
Ops.push_back(VL);

return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Store->getVTList(),
Ops, Store->getMemoryVT(),
Store->getMemOperand());
}
case Intrinsic::riscv_seg2_store:
case Intrinsic::riscv_seg3_store:
case Intrinsic::riscv_seg4_store:
@@ -17551,43 +17429,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
// By default we do not combine any intrinsic.
default:
return SDValue();
case Intrinsic::riscv_masked_strided_load: {
MVT VT = N->getSimpleValueType(0);
auto *Load = cast<MemIntrinsicSDNode>(N);
SDValue PassThru = N->getOperand(2);
SDValue Base = N->getOperand(3);
SDValue Stride = N->getOperand(4);
SDValue Mask = N->getOperand(5);

// If the stride is equal to the element size in bytes, we can use
// a masked.load.
const unsigned ElementSize = VT.getScalarStoreSize();
if (auto *StrideC = dyn_cast<ConstantSDNode>(Stride);
StrideC && StrideC->getZExtValue() == ElementSize)
return DAG.getMaskedLoad(VT, DL, Load->getChain(), Base,
DAG.getUNDEF(XLenVT), Mask, PassThru,
Load->getMemoryVT(), Load->getMemOperand(),
ISD::UNINDEXED, ISD::NON_EXTLOAD);
return SDValue();
}
case Intrinsic::riscv_masked_strided_store: {
auto *Store = cast<MemIntrinsicSDNode>(N);
SDValue Value = N->getOperand(2);
SDValue Base = N->getOperand(3);
SDValue Stride = N->getOperand(4);
SDValue Mask = N->getOperand(5);

// If the stride is equal to the element size in bytes, we can use
// a masked.store.
const unsigned ElementSize = Value.getValueType().getScalarStoreSize();
if (auto *StrideC = dyn_cast<ConstantSDNode>(Stride);
StrideC && StrideC->getZExtValue() == ElementSize)
return DAG.getMaskedStore(Store->getChain(), DL, Value, Base,
DAG.getUNDEF(XLenVT), Mask,
Value.getValueType(), Store->getMemOperand(),
ISD::UNINDEXED, false);
return SDValue();
}
case Intrinsic::riscv_vcpop:
case Intrinsic::riscv_vcpop_mask:
case Intrinsic::riscv_vfirst:
32 changes: 1 addition & 31 deletions llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -41,24 +41,6 @@ multiclass VPatUSLoadStoreSDNode<ValueType type,
(store_instr reg_class:$rs2, GPR:$rs1, avl, log2sew)>;
}

multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type,
int log2sew,
LMULInfo vlmul,
VReg reg_class,
int sew = !shl(1, log2sew)> {
defvar load_instr =
!cast<Instruction>("VL"#!substr(vlmul.MX, 1)#"RE"#sew#"_V");
defvar store_instr =
!cast<Instruction>("VS"#!substr(vlmul.MX, 1)#"R_V");

// Load
def : Pat<(type (load GPR:$rs1)),
(load_instr GPR:$rs1)>;
// Store
def : Pat<(store type:$rs2, GPR:$rs1),
(store_instr reg_class:$rs2, GPR:$rs1)>;
}

multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m> {
defvar load_instr = !cast<Instruction>("PseudoVLM_V_"#m.BX);
defvar store_instr = !cast<Instruction>("PseudoVSM_V_"#m.BX);
@@ -895,23 +877,11 @@ multiclass VPatAVGADD_VV_VX_RM<SDNode vop, int vxrm, string suffix = ""> {
//===----------------------------------------------------------------------===//

// 7.4. Vector Unit-Stride Instructions
foreach vti = !listconcat(FractionalGroupIntegerVectors,
FractionalGroupFloatVectors,
FractionalGroupBFloatVectors) in
foreach vti = AllVectors in
let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
GetVTypePredicates<vti>.Predicates) in
defm : VPatUSLoadStoreSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
vti.AVL, vti.RegClass>;
foreach vti = [VI8M1, VI16M1, VI32M1, VI64M1, VBF16M1, VF16M1, VF32M1, VF64M1] in
let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
GetVTypePredicates<vti>.Predicates) in
defm : VPatUSLoadStoreWholeVRSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
vti.RegClass>;
foreach vti = !listconcat(GroupIntegerVectors, GroupFloatVectors, GroupBFloatVectors) in
let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
GetVTypePredicates<vti>.Predicates) in
defm : VPatUSLoadStoreWholeVRSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
vti.RegClass>;
foreach mti = AllMasks in
let Predicates = [HasVInstructions] in
defm : VPatUSLoadStoreMaskSDNode<mti>;
54 changes: 54 additions & 0 deletions llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -59,6 +59,7 @@ class RISCVVectorPeephole : public MachineFunctionPass {

private:
bool convertToVLMAX(MachineInstr &MI) const;
bool convertToWholeRegister(MachineInstr &MI) const;
bool convertToUnmasked(MachineInstr &MI) const;
bool convertVMergeToVMv(MachineInstr &MI) const;

@@ -155,6 +156,58 @@ bool RISCVVectorPeephole::isAllOnesMask(const MachineInstr *MaskDef) const {
}
}

/// Convert unit-strided, unmasked loads and stores to whole-register equivalents
/// to avoid the dependency on $vl and $vtype.
///
/// %x = PseudoVLE8_V_M1 %passthru, %ptr, %vlmax, policy
/// PseudoVSE8_V_M1 %v, %ptr, %vlmax
///
/// ->
///
/// %x = VL1RE8_V %ptr
/// VS1R_V %v, %ptr
bool RISCVVectorPeephole::convertToWholeRegister(MachineInstr &MI) const {
#define CASE_WHOLE_REGISTER_LMUL_SEW(lmul, sew) \
case RISCV::PseudoVLE##sew##_V_M##lmul: \
NewOpc = RISCV::VL##lmul##RE##sew##_V; \
break; \
case RISCV::PseudoVSE##sew##_V_M##lmul: \
NewOpc = RISCV::VS##lmul##R_V; \
break;
#define CASE_WHOLE_REGISTER_LMUL(lmul) \
CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 8) \
CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 16) \
CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 32) \
CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 64)

unsigned NewOpc;
switch (MI.getOpcode()) {
CASE_WHOLE_REGISTER_LMUL(1)
CASE_WHOLE_REGISTER_LMUL(2)
CASE_WHOLE_REGISTER_LMUL(4)
CASE_WHOLE_REGISTER_LMUL(8)
default:
return false;
}

MachineOperand &VLOp = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
if (!VLOp.isImm() || VLOp.getImm() != RISCV::VLMaxSentinel)
return false;

// Whole register instructions aren't pseudos so they don't have
// policy/SEW/AVL ops, and they don't have passthrus.
if (RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags))
MI.removeOperand(RISCVII::getVecPolicyOpNum(MI.getDesc()));
MI.removeOperand(RISCVII::getSEWOpNum(MI.getDesc()));
MI.removeOperand(RISCVII::getVLOpNum(MI.getDesc()));
if (RISCVII::isFirstDefTiedToFirstUse(MI.getDesc()))
MI.removeOperand(1);

MI.setDesc(TII->get(NewOpc));

return true;
}

// Transform (VMERGE_VVM_<LMUL> false, false, true, allones, vl, sew) to
// (VMV_V_V_<LMUL> false, true, vl, sew). It may decrease uses of VMSET.
bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const {
@@ -281,6 +334,7 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) {
for (MachineInstr &MI : MBB) {
Changed |= convertToVLMAX(MI);
Changed |= convertToUnmasked(MI);
Changed |= convertToWholeRegister(MI);
Changed |= convertVMergeToVMv(MI);
}
}
98 changes: 18 additions & 80 deletions llvm/lib/TargetParser/Host.cpp
@@ -50,6 +50,11 @@
#if defined(__sun__) && defined(__svr4__)
#include <kstat.h>
#endif
#if defined(__GNUC__) || defined(__clang__)
#if (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
#include <cpuid.h>
#endif
#endif

#define DEBUG_TYPE "host-detection"

@@ -522,68 +527,15 @@ StringRef sys::detail::getHostCPUNameForBPF() {
#endif
}

#if defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64__) || defined(_M_X64)

// The check below for i386 was copied from clang's cpuid.h (__get_cpuid_max).
// Check motivated by bug reports for OpenSSL crashing on CPUs without CPUID
// support. Consequently, for i386, the presence of CPUID is checked first
// via the corresponding eflags bit.
// Removal of cpuid.h header motivated by PR30384
// Header cpuid.h and method __get_cpuid_max are not used in llvm, clang, openmp
// or test-suite, but are used in external projects e.g. libstdcxx
static bool isCpuIdSupported() {
#if defined(__GNUC__) || defined(__clang__)
#if defined(__i386__)
int __cpuid_supported;
__asm__(" pushfl\n"
" popl %%eax\n"
" movl %%eax,%%ecx\n"
" xorl $0x00200000,%%eax\n"
" pushl %%eax\n"
" popfl\n"
" pushfl\n"
" popl %%eax\n"
" movl $0,%0\n"
" cmpl %%eax,%%ecx\n"
" je 1f\n"
" movl $1,%0\n"
"1:"
: "=r"(__cpuid_supported)
:
: "eax", "ecx");
if (!__cpuid_supported)
return false;
#endif
return true;
#endif
return true;
}
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
defined(_M_X64)

/// getX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in
/// the specified arguments. If we can't run cpuid on the host, return true.
static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
unsigned *rECX, unsigned *rEDX) {
#if defined(__GNUC__) || defined(__clang__)
#if defined(__x86_64__)
// gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
// FIXME: should we save this for Clang?
__asm__("movq\t%%rbx, %%rsi\n\t"
"cpuid\n\t"
"xchgq\t%%rbx, %%rsi\n\t"
: "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
: "a"(value));
return false;
#elif defined(__i386__)
__asm__("movl\t%%ebx, %%esi\n\t"
"cpuid\n\t"
"xchgl\t%%ebx, %%esi\n\t"
: "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
: "a"(value));
return false;
#else
return true;
#endif
#if (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
return !__get_cpuid(value, rEAX, rEBX, rECX, rEDX);
#elif defined(_MSC_VER)
// The MSVC intrinsic is portable across x86 and x64.
int registers[4];
@@ -610,9 +562,6 @@ VendorSignatures getVendorSignature(unsigned *MaxLeaf) {
else
*MaxLeaf = 0;

if (!isCpuIdSupported())
return VendorSignatures::UNKNOWN;

if (getX86CpuIDAndInfo(0, MaxLeaf, &EBX, &ECX, &EDX) || *MaxLeaf < 1)
return VendorSignatures::UNKNOWN;

@@ -640,26 +589,12 @@ using namespace llvm::sys::detail::x86;
static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,
unsigned *rEAX, unsigned *rEBX, unsigned *rECX,
unsigned *rEDX) {
#if defined(__GNUC__) || defined(__clang__)
#if defined(__x86_64__)
// gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
// FIXME: should we save this for Clang?
__asm__("movq\t%%rbx, %%rsi\n\t"
"cpuid\n\t"
"xchgq\t%%rbx, %%rsi\n\t"
: "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
: "a"(value), "c"(subleaf));
return false;
#elif defined(__i386__)
__asm__("movl\t%%ebx, %%esi\n\t"
"cpuid\n\t"
"xchgl\t%%ebx, %%esi\n\t"
: "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
: "a"(value), "c"(subleaf));
return false;
#else
return true;
#endif
// TODO(boomanaiden154): When the minimum toolchain versions for gcc and clang
// are such that __cpuidex is defined within cpuid.h for both, we can remove
// the __get_cpuid_count function and share the MSVC implementation between
// all three.
#if (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
return !__get_cpuid_count(value, subleaf, rEAX, rEBX, rECX, rEDX);
#elif defined(_MSC_VER)
int registers[4];
__cpuidex(registers, value, subleaf);
@@ -675,6 +610,9 @@ static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,

// Read control register 0 (XCR0). Used to detect features such as AVX.
static bool getX86XCR0(unsigned *rEAX, unsigned *rEDX) {
// TODO(boomanaiden154): When the minimum toolchain versions for gcc and clang
// are such that _xgetbv is supported by both, we can unify the implementation
// with MSVC and remove all inline assembly.
#if defined(__GNUC__) || defined(__clang__)
// Check xgetbv; this uses a .byte sequence instead of the instruction
// directly because older assemblers do not include support for xgetbv and
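For context on the cpuid.h helpers the rewritten functions rely on, here is a minimal standalone sketch of querying leaf 0 with __get_cpuid (GCC/Clang on x86 assumed; it mirrors, but is not copied from, the Host.cpp logic):

#include <cpuid.h>
#include <cstdio>
#include <cstring>

int main() {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  // Leaf 0 returns the maximum supported leaf in EAX and the vendor string
  // split across EBX, EDX, ECX. __get_cpuid returns 0 when the leaf is
  // unsupported (or CPUID is absent on very old 32-bit CPUs), which is the
  // failure case getX86CpuIDAndInfo signals by returning true.
  if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx)) {
    std::fprintf(stderr, "cpuid not supported\n");
    return 1;
  }
  char vendor[13];
  std::memcpy(vendor + 0, &ebx, 4);
  std::memcpy(vendor + 4, &edx, 4);
  std::memcpy(vendor + 8, &ecx, 4);
  vendor[12] = '\0';
  std::printf("max leaf: %u, vendor: %s\n", eax, vendor);
  return 0;
}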
35 changes: 35 additions & 0 deletions llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -3018,6 +3018,37 @@ void LibCallSimplifier::classifyArgUse(
}
}

/// Constant folds remquo
Value *LibCallSimplifier::optimizeRemquo(CallInst *CI, IRBuilderBase &B) {
const APFloat *X, *Y;
if (!match(CI->getArgOperand(0), m_APFloat(X)) ||
!match(CI->getArgOperand(1), m_APFloat(Y)))
return nullptr;

APFloat::opStatus Status;
APFloat Quot = *X;
Status = Quot.divide(*Y, APFloat::rmNearestTiesToEven);
if (Status != APFloat::opOK && Status != APFloat::opInexact)
return nullptr;
APFloat Rem = *X;
if (Rem.remainder(*Y) != APFloat::opOK)
return nullptr;

// TODO: We only need to keep at least the last three bits of x/y.
unsigned IntBW = TLI->getIntSize();
APSInt QuotInt(IntBW, /*isUnsigned=*/false);
bool IsExact;
Status =
Quot.convertToInteger(QuotInt, APFloat::rmNearestTiesToEven, &IsExact);
if (Status != APFloat::opOK && Status != APFloat::opInexact)
return nullptr;

B.CreateAlignedStore(
ConstantInt::get(B.getIntNTy(IntBW), QuotInt.getExtValue()),
CI->getArgOperand(2), CI->getParamAlign(2));
return ConstantFP::get(CI->getType(), Rem);
}

//===----------------------------------------------------------------------===//
// Integer Library Call Optimizations
//===----------------------------------------------------------------------===//
@@ -3926,6 +3957,10 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
case LibFunc_cabsf:
case LibFunc_cabsl:
return optimizeCAbs(CI, Builder);
case LibFunc_remquo:
case LibFunc_remquof:
case LibFunc_remquol:
return optimizeRemquo(CI, Builder);
default:
return nullptr;
}
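For reference, the libc semantics the remquo fold above models: the call returns the IEEE remainder of x/y and stores through the pointer an integer that carries the sign of x/y and at least the low three bits of the rounded integral quotient. A minimal sketch against the standard C++ library, using the same constants as the new tests:

#include <cmath>
#include <cstdio>

int main() {
  int quo = 0;
  // -5 / 3 = -1.67, which rounds to nearest as -2, so the remainder is
  // -5 - (-2) * 3 = 1 and the stored quotient is -2 (implementations must
  // preserve at least its sign and low three bits).
  float rem = std::remquo(-5.0f, 3.0f, &quo);
  std::printf("rem = %f, quo = %d\n", rem, quo); // rem = 1.000000, quo = -2
  return 0;
}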
1 change: 1 addition & 0 deletions llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
@@ -3,6 +3,7 @@
# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
# RUN: | FileCheck %s

# CHECK: Physical registers are not supported in window scheduling!
# CHECK: The WindowScheduler failed to initialize!

---
@@ -0,0 +1,83 @@
# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
# RUN: | FileCheck %s
# REQUIRES: asserts

# Test that no window scheduling is performed if the II is set by pragma
# (llvm.loop.pipeline.initiationinterval).

# CHECK: Window scheduling is disabled when llvm.loop.pipeline.initiationinterval is set.

--- |
define void @test_pragma_ii_fail(ptr %a0, i32 %a1) {
b0:
%v0 = icmp sgt i32 %a1, 1
br i1 %v0, label %b1, label %b4

b1: ; preds = %b0
%v1 = load i32, ptr %a0, align 4
%v2 = add i32 %v1, 10
%v4 = add i32 %a1, -1
%cgep = getelementptr i32, ptr %a0, i32 1
br label %b2

b2: ; preds = %b2, %b1
%v5 = phi i32 [ %v12, %b2 ], [ %v4, %b1 ]
%v6 = phi ptr [ %cgep2, %b2 ], [ %cgep, %b1 ]
%v7 = phi i32 [ %v10, %b2 ], [ %v2, %b1 ]
store i32 %v7, ptr %v6, align 4
%v8 = add i32 %v7, 10
%cgep1 = getelementptr i32, ptr %v6, i32 -1
store i32 %v8, ptr %cgep1, align 4
%v10 = add i32 %v7, 10
%v12 = add i32 %v5, -1
%v13 = icmp eq i32 %v12, 0
%cgep2 = getelementptr i32, ptr %v6, i32 1
br i1 %v13, label %b4, label %b2, !llvm.loop !0

b4: ; preds = %b2, %b0
ret void
}

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.pipeline.initiationinterval", i32 2}
...
---
name: test_pragma_ii_fail
tracksRegLiveness: true
body: |
bb.0.b0:
successors: %bb.1(0x40000000), %bb.3(0x40000000)
liveins: $r0, $r1
%0:intregs = COPY $r1
%1:intregs = COPY $r0
%2:predregs = C2_cmpgti %0, 1
J2_jumpf %2, %bb.3, implicit-def dead $pc
J2_jump %bb.1, implicit-def dead $pc
bb.1.b1:
successors: %bb.2(0x80000000)
%3:intregs, %4:intregs = L2_loadri_pi %1, 4
%5:intregs = A2_addi killed %3, 10
%6:intregs = A2_addi %0, -1
%7:intregs = COPY %6
J2_loop0r %bb.2, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
bb.2.b2 (machine-block-address-taken):
successors: %bb.3(0x04000000), %bb.2(0x7c000000)
%8:intregs = PHI %4, %bb.1, %9, %bb.2
%10:intregs = PHI %5, %bb.1, %11, %bb.2
S2_storeri_io %8, 0, %10
%11:intregs = A2_addi %10, 10
S2_storeri_io %8, -4, %11
%9:intregs = A2_addi %8, 4
ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
J2_jump %bb.3, implicit-def dead $pc
bb.3.b4:
PS_jmpret $r31, implicit-def dead $pc
...
45 changes: 45 additions & 0 deletions llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir
@@ -0,0 +1,45 @@
# REQUIRES: asserts
# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
# RUN: | FileCheck %s

# CHECK-NOT: Can't find a valid II. Keep searching...
# CHECK: Start analyzing II
# CHECK: Start scheduling Phis
# CHECK: Current window Offset is {{[0-9]+}} and II is {{[0-9]+}}

---
name: relu
tracksRegLiveness: true
body: |
bb.0:
successors: %bb.2(0x30000000), %bb.1(0x50000000)
liveins: $r0, $r1, $r2
%0:intregs = COPY $r2
%1:intregs = COPY $r1
%2:intregs = COPY $r0
%3:predregs = C2_cmpeqi %2, 0
J2_jumpt killed %3, %bb.2, implicit-def dead $pc
J2_jump %bb.1, implicit-def dead $pc
bb.1:
successors: %bb.3(0x80000000)
%4:hvxvr = V6_vd0
%5:intregs = A2_addi %2, 31
%6:intregs = S2_lsr_i_r %5, 5
%7:intregs = COPY %6
J2_loop0r %bb.3, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
J2_jump %bb.3, implicit-def dead $pc
bb.2:
PS_jmpret $r31, implicit-def dead $pc
bb.3 (machine-block-address-taken):
successors: %bb.3(0x7c000000), %bb.2(0x04000000)
%8:intregs = PHI %1, %bb.1, %9, %bb.3
%10:intregs = PHI %0, %bb.1, %14, %bb.3
%11:hvxvr, %9:intregs = V6_vL32b_pi %8, 128
%12:intregs = COPY %10
%13:hvxvr = V6_vmaxw killed %11, %4
%14:intregs = V6_vS32b_pi %12, 128, killed %13
ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
J2_jump %bb.2, implicit-def dead $pc
...

36 changes: 36 additions & 0 deletions llvm/test/CodeGen/LoongArch/rotl-rotr.ll
@@ -504,6 +504,42 @@ define i64 @rotr_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
ret i64 %f
}

define signext i32 @rotr_64_trunc_32(i64 %x, i64 %y) nounwind {
; LA32-LABEL: rotr_64_trunc_32:
; LA32: # %bb.0:
; LA32-NEXT: srl.w $a3, $a0, $a2
; LA32-NEXT: xori $a4, $a2, 31
; LA32-NEXT: slli.w $a5, $a1, 1
; LA32-NEXT: sll.w $a4, $a5, $a4
; LA32-NEXT: or $a3, $a3, $a4
; LA32-NEXT: addi.w $a4, $a2, -32
; LA32-NEXT: slti $a5, $a4, 0
; LA32-NEXT: maskeqz $a3, $a3, $a5
; LA32-NEXT: srl.w $a1, $a1, $a4
; LA32-NEXT: masknez $a1, $a1, $a5
; LA32-NEXT: or $a1, $a3, $a1
; LA32-NEXT: sub.w $a3, $zero, $a2
; LA32-NEXT: sll.w $a0, $a0, $a3
; LA32-NEXT: ori $a3, $zero, 32
; LA32-NEXT: sub.w $a2, $a3, $a2
; LA32-NEXT: srai.w $a2, $a2, 31
; LA32-NEXT: and $a0, $a2, $a0
; LA32-NEXT: or $a0, $a1, $a0
; LA32-NEXT: ret
;
; LA64-LABEL: rotr_64_trunc_32:
; LA64: # %bb.0:
; LA64-NEXT: rotr.d $a0, $a0, $a1
; LA64-NEXT: addi.w $a0, $a0, 0
; LA64-NEXT: ret
%z = sub i64 64, %y
%b = lshr i64 %x, %y
%c = shl i64 %x, %z
%d = or i64 %b, %c
%e = trunc i64 %d to i32
ret i32 %e
}

define signext i32 @rotri_i32(i32 signext %a) nounwind {
; LA32-LABEL: rotri_i32:
; LA32: # %bb.0:
16 changes: 0 additions & 16 deletions llvm/test/CodeGen/RISCV/pr89833.ll

This file was deleted.

106 changes: 65 additions & 41 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/RISCV/rvv/pr99782.ll
@@ -5,9 +5,9 @@ define void @vslidedown() {
; CHECK-LABEL: name: vslidedown
; CHECK: bb.0.entry:
; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI %stack.0.v, 0
; CHECK-NEXT: [[VL8RE8_V:%[0-9]+]]:vrm8 = VL8RE8_V killed [[ADDI]] :: (load (<vscale x 1 x s512>) from %ir.v, align 1)
; CHECK-NEXT: [[PseudoVLE8_V_M8_:%[0-9]+]]:vrm8 = PseudoVLE8_V_M8 $noreg, killed [[ADDI]], -1, 3 /* e8 */, 3 /* ta, ma */ :: (load (<vscale x 1 x s512>) from %ir.v, align 1)
; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI %stack.1, 0
; CHECK-NEXT: VS8R_V killed [[VL8RE8_V]], killed [[ADDI1]] :: (store (<vscale x 1 x s512>) into %stack.1)
; CHECK-NEXT: PseudoVSE8_V_M8 killed [[PseudoVLE8_V_M8_]], killed [[ADDI1]], -1, 3 /* e8 */ :: (store (<vscale x 1 x s512>) into %stack.1)
; CHECK-NEXT: INLINEASM &"vadd.vv $0, $0, $0", 25 /* sideeffect mayload maystore attdialect */, 262166 /* mem:m */, %stack.0.v, 0, 262166 /* mem:m */, %stack.1, 0
; CHECK-NEXT: PseudoRET
entry:
11 changes: 7 additions & 4 deletions llvm/test/CodeGen/RISCV/rvv/rvv-out-arguments.ll
@@ -20,7 +20,8 @@ define dso_local void @lots_args(i32 signext %x0, i32 signext %x1, <vscale x 16
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a0, s0, a0
; CHECK-NEXT: addi a0, a0, -64
; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: sw a2, -36(s0)
; CHECK-NEXT: sw a3, -40(s0)
; CHECK-NEXT: sw a4, -44(s0)
@@ -85,7 +86,8 @@ define dso_local signext i32 @main() #0 {
; CHECK-NEXT: slli s1, s1, 3
; CHECK-NEXT: sub s1, s0, s1
; CHECK-NEXT: addi s1, s1, -112
; CHECK-NEXT: vs8r.v v8, (s1)
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vse32.v v8, (s1)
; CHECK-NEXT: li a0, 1
; CHECK-NEXT: sw a0, -76(s0)
; CHECK-NEXT: sw a0, -80(s0)
@@ -99,7 +101,7 @@ define dso_local signext i32 @main() #0 {
; CHECK-NEXT: sw a0, -112(s0)
; CHECK-NEXT: lw a0, -76(s0)
; CHECK-NEXT: lw a1, -80(s0)
; CHECK-NEXT: vl8re32.v v8, (s1)
; CHECK-NEXT: vle32.v v8, (s1)
; CHECK-NEXT: lw a2, -84(s0)
; CHECK-NEXT: lw a3, -88(s0)
; CHECK-NEXT: lw a4, -92(s0)
@@ -115,7 +117,8 @@ define dso_local signext i32 @main() #0 {
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: lw a0, -76(s0)
; CHECK-NEXT: lw a1, -80(s0)
; CHECK-NEXT: vl8re32.v v8, (s1)
; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v8, (s1)
; CHECK-NEXT: lw a2, -84(s0)
; CHECK-NEXT: lw a3, -88(s0)
; CHECK-NEXT: lw a4, -92(s0)
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll
@@ -17,7 +17,7 @@ define void @vpmerge_vpload_store(<vscale x 2 x i32> %passthru, ptr %p, <vscale
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
; CHECK-NEXT: $v0 = COPY [[COPY1]]
; CHECK-NEXT: [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], $v0, [[COPY]], 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size from %ir.p, align 8)
; CHECK-NEXT: VS1R_V killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]] :: (store (<vscale x 1 x s64>) into %ir.p)
; CHECK-NEXT: PseudoVSE32_V_M1 killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]], -1, 5 /* e32 */ :: (store (<vscale x 1 x s64>) into %ir.p)
; CHECK-NEXT: PseudoRET
%a = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr %p, <vscale x 2 x i1> splat (i1 -1), i32 %vl)
%b = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> %m, <vscale x 2 x i32> %a, <vscale x 2 x i32> %passthru, i32 %vl)
@@ -36,7 +36,7 @@ define void @vpselect_vpload_store(<vscale x 2 x i32> %passthru, ptr %p, <vscale
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
; CHECK-NEXT: $v0 = COPY [[COPY1]]
; CHECK-NEXT: [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], $v0, [[COPY]], 5 /* e32 */, 1 /* ta, mu */ :: (load unknown-size from %ir.p, align 8)
; CHECK-NEXT: VS1R_V killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]] :: (store (<vscale x 1 x s64>) into %ir.p)
; CHECK-NEXT: PseudoVSE32_V_M1 killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]], -1, 5 /* e32 */ :: (store (<vscale x 1 x s64>) into %ir.p)
; CHECK-NEXT: PseudoRET
%a = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr %p, <vscale x 2 x i1> splat (i1 -1), i32 %vl)
%b = call <vscale x 2 x i32> @llvm.vp.select.nxv2i32(<vscale x 2 x i1> %m, <vscale x 2 x i32> %a, <vscale x 2 x i32> %passthru, i32 %vl)
133 changes: 0 additions & 133 deletions llvm/test/CodeGen/RISCV/rvv/strided-load-store-intrinsics.ll

This file was deleted.

54 changes: 37 additions & 17 deletions llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/vpload.ll
@@ -536,3 +536,14 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
store <vscale x 1 x double> %hi, ptr %out
ret <vscale x 16 x double> %lo
}

define <vscale x 8 x i8> @vpload_all_active_nxv8i8(ptr %ptr) {
; CHECK-LABEL: vpload_all_active_nxv8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vl1r.v v8, (a0)
; CHECK-NEXT: ret
%vscale = call i32 @llvm.vscale()
%evl = mul i32 %vscale, 8
%load = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
ret <vscale x 8 x i8> %load
}
11 changes: 11 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -459,3 +459,14 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
call void @llvm.vp.store.nxv17f64.p0(<vscale x 17 x double> %val, ptr %ptr, <vscale x 17 x i1> %m, i32 %evl)
ret void
}

define void @vpstore_all_active_nxv8i8(<vscale x 8 x i8> %val, ptr %ptr) {
; CHECK-LABEL: vpstore_all_active_nxv8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vs1r.v v8, (a0)
; CHECK-NEXT: ret
%vscale = call i32 @llvm.vscale()
%evl = mul i32 %vscale, 8
call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> %val, ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
ret void
}
40 changes: 8 additions & 32 deletions llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
@@ -487,42 +487,18 @@ define <vscale x 8 x double> @vfmerge_nzv_nxv8f64(<vscale x 8 x double> %va, <vs
define <vscale x 16 x double> @vselect_combine_regression(<vscale x 16 x i64> %va, <vscale x 16 x double> %vb) {
; CHECK-LABEL: vselect_combine_regression:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vmv8r.v v24, v16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vl8re64.v v8, (a1)
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vmseq.vi v24, v16, 0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmseq.vi v0, v16, 0
; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, mu
; CHECK-NEXT: vmseq.vi v0, v8, 0
; CHECK-NEXT: vmv.v.i v16, 0
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: vmseq.vi v7, v24, 0
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vle64.v v8, (a0), v0.t
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vle64.v v16, (a1), v0.t
; CHECK-NEXT: ret
%cond = icmp eq <vscale x 16 x i64> %va, zeroinitializer
%sel = select <vscale x 16 x i1> %cond, <vscale x 16 x double> %vb, <vscale x 16 x double> zeroinitializer
139 changes: 139 additions & 0 deletions llvm/test/Transforms/InstCombine/remquo.ll
@@ -0,0 +1,139 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

define float @remquo_f32(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -2, ptr [[QUO]], align 4
; CHECK-NEXT: ret float 1.000000e+00
;
entry:
%call = call float @remquof(float -5.000000e+00, float 3.000000e+00, ptr %quo)
ret float %call
}

define float @remquo_f32_quo_sign(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_quo_sign(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 2, ptr [[QUO]], align 4
; CHECK-NEXT: ret float -1.000000e+00
;
entry:
%call = call float @remquof(float 5.000000e+00, float 3.000000e+00, ptr %quo)
ret float %call
}

define float @remquo_f32_round(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_round(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -6, ptr [[QUO]], align 4
; CHECK-NEXT: ret float 0xBFC9999900000000
;
entry:
%call = call float @remquof(float -5.000000e+00, float 0x3FE99999A0000000, ptr %quo)
ret float %call
}

define double @remquo_f64(ptr %quo) {
; CHECK-LABEL: define double @remquo_f64(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -5, ptr [[QUO]], align 4
; CHECK-NEXT: ret double -0.000000e+00
;
entry:
%call = call double @remquo(double -5.000000e+00, double 1.000000e+00, ptr %quo)
ret double %call
}

; Negative tests

define float @remquo_f32_inf_x(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_inf_x(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float 0x7FF0000000000000, float 1.000000e+00, ptr [[QUO]])
; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = call float @remquof(float 0x7FF0000000000000, float 1.000000e+00, ptr %quo)
ret float %call
}

define float @remquo_f32_zero_y(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_zero_y(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr [[QUO]])
; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr %quo)
ret float %call
}

define float @remquo_f32_nzero_y(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_nzero_y(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float -0.000000e+00, ptr [[QUO]])
; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = call float @remquof(float -5.000000e+00, float -0.000000e+00, ptr %quo)
ret float %call
}

define float @remquo_f32_nan_x(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_nan_x(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float 0x7FF8000000000000, float 1.000000e+00, ptr [[QUO]])
; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = call float @remquof(float 0x7FF8000000000000, float 1.000000e+00, ptr %quo)
ret float %call
}

define float @remquo_f32_nan_y(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_nan_y(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float 1.000000e+00, float 0x7FF8000000000000, ptr [[QUO]])
; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = call float @remquof(float 1.000000e+00, float 0x7FF8000000000000, ptr %quo)
ret float %call
}

define float @remquo_f32_strictfp(ptr %quo) strictfp {
; CHECK-LABEL: define float @remquo_f32_strictfp(
; CHECK-SAME: ptr [[QUO:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float 3.000000e+00, ptr [[QUO]]) #[[ATTR0]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = call float @remquof(float -5.000000e+00, float 3.000000e+00, ptr %quo) strictfp
ret float %call
}

define float @remquo_f32_zero_y_strictfp(ptr %quo) strictfp {
; CHECK-LABEL: define float @remquo_f32_zero_y_strictfp(
; CHECK-SAME: ptr [[QUO:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr [[QUO]]) #[[ATTR0]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr %quo) strictfp
ret float %call
}

declare float @remquof(float, float, ptr)
declare double @remquo(double, double, ptr)
16 changes: 16 additions & 0 deletions llvm/test/Transforms/InstCombine/remquol-fp128.ll
@@ -0,0 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

define fp128 @remquo_fp128(ptr %quo) {
; CHECK-LABEL: define fp128 @remquo_fp128(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -2, ptr [[QUO]], align 4
; CHECK-NEXT: ret fp128 0xL00000000000000003FFF000000000000
;
entry:
%call = call fp128 @remquol(fp128 0xL0000000000000000C001400000000000, fp128 0xL00000000000000004000800000000000, ptr %quo)
ret fp128 %call
}

declare fp128 @remquol(fp128, fp128, ptr)
16 changes: 16 additions & 0 deletions llvm/test/Transforms/InstCombine/remquol-fp80.ll
@@ -0,0 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

define x86_fp80 @remquo_fp80(ptr %quo) {
; CHECK-LABEL: define x86_fp80 @remquo_fp80(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -2, ptr [[QUO]], align 4
; CHECK-NEXT: ret x86_fp80 0xK3FFF8000000000000000
;
entry:
%call = call x86_fp80 @remquol(x86_fp80 0xKC001A000000000000000, x86_fp80 0xK4000C000000000000000, ptr %quo)
ret x86_fp80 %call
}

declare x86_fp80 @remquol(x86_fp80, x86_fp80, ptr)
16 changes: 16 additions & 0 deletions llvm/test/Transforms/InstCombine/remquol-ppc-fp128.ll
@@ -0,0 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

define ppc_fp128 @remquo_ppc_fp128(ptr %quo) {
; CHECK-LABEL: define ppc_fp128 @remquo_ppc_fp128(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -2, ptr [[QUO]], align 4
; CHECK-NEXT: ret ppc_fp128 0xM3FF00000000000000000000000000000
;
entry:
%call = call ppc_fp128 @remquol(ppc_fp128 0xMC0140000000000000000000000000000, ppc_fp128 0xM40080000000000000000000000000000, ptr %quo)
ret ppc_fp128 %call
}

declare ppc_fp128 @remquol(ppc_fp128, ppc_fp128, ptr)
File renamed without changes.
55 changes: 47 additions & 8 deletions mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -586,6 +586,14 @@ static SmallVector<bool> getDimsToReduce(LinalgOp linalgOp) {
llvm::map_range(linalgOp.getIteratorTypesArray(), isReductionIterator));
}

/// Check if `op` is a linalg.reduce or a linalg.generic that has at least one
/// reduction iterator.
static bool hasReductionIterator(LinalgOp &op) {
return isa<linalg::ReduceOp>(op) ||
(isa<linalg::GenericOp>(op) &&
llvm::any_of(op.getIteratorTypesArray(), isReductionIterator));
}

/// Build a vector.transfer_write of `value` into `outputOperand` at indices set
/// to all `0`; where `outputOperand` is an output operand of the LinalgOp
/// currently being vectorized. If `dest` has null rank, build a memref.store.
@@ -1787,6 +1795,9 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
if (isa<ConvolutionOpInterface>(op.getOperation()))
return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);

if (hasReductionIterator(op))
return reductionPreconditions(op);

// TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
// linalg.copy ops and ops that implement ContractionOpInterface for now.
if (!isElementwise(op) &&
@@ -1976,6 +1987,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
// 1. exactly 1 dim is scalable and that's the _last_ parallel dim
// 2. exactly 2 dims are scalable and those are the _last two adjacent_
// parallel dims
// 3. exactly 1 reduction dim is scalable and that's the last (innermost) dim
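// A configuration meeting 3. would be, e.g., iterators = [parallel, reduction]
// with scalable flags = [false, true], as in the linalg.matvec tests below.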
// The 2nd restriction above means that only Matmul-like Ops are supported
// when 2 dims are scalable, e.g. :
// * iterators = [parallel, parallel, reduction]
@@ -1992,19 +2004,45 @@ vectorizeScalableVectorPrecondition(Operation *op,
scalableFlags.pop_back();
}

// TODO: Support scalable vectorisation for reduction dims
if (iterators.back() == utils::IteratorType::reduction)
return failure();

// If this is not the _last_ parallel dim, 1. above is not met
if (seenParalell)
return failure();
switch (iterators.back()) {
case utils::IteratorType::reduction: {
// Check 3. above is met.
if (iterators.size() != inputVectorSizes.size()) {
LDBG("Non-trailing reduction dim requested for scalable "
"vectorization\n");
return failure();
}
if (isa<linalg::MatmulOp>(op) || isa<linalg::MatmulTransposeAOp>(op)) {
LDBG("Scalable vectorization of the reduction dim in Matmul-like ops "
"is not supported\n");
return failure();
}
break;
}
case utils::IteratorType::parallel: {
// Check 1. and 2. above are met.
if (seenParalell) {
LDBG("Inner parallel dim not requested for scalable "
"vectorization\n");
return failure();
}
break;
}
}

// If present, check the 2nd scalable dim. ATM, only Matmul-like Ops are
// supported, for which we expect the following config:
// * iterators = [parallel, parallel, reduction]
// * scalable flags = [true, true, false]
if (numOfScalableDims == 2) {
// Disallow the case below, which breaks 3. above:
// * iterators = [..., parallel, reduction]
// * scalable flags = [..., true, true]
if (iterators.back() == utils::IteratorType::reduction) {
LDBG("Higher dim than the trailing reduction dim requested for scalable "
"vectorization\n");
return failure();
}
scalableFlags.pop_back();
iterators.pop_back();

@@ -2017,7 +2055,8 @@ vectorizeScalableVectorPrecondition(Operation *op,
// presence of scalable vectors
return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
isa<linalg::MatmulTransposeAOp>(op) ||
isa<linalg::DepthwiseConv1DNwcWcOp>(op));
isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
isa<linalg::MatvecOp>(op) || hasReductionIterator(linalgOp));
}

LogicalResult mlir::linalg::vectorizeOpPrecondition(
165 changes: 165 additions & 0 deletions mlir/test/Dialect/Linalg/vectorization-scalable.mlir
@@ -189,3 +189,168 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}

// -----

func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
%arg1: tensor<f32>) -> tensor<f32> {

%0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
(%in: f32, %init: f32) {
%0 = arith.addf %in, %init : f32
linalg.yield %0 : f32
}
return %0 : tensor<f32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_1d(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[C0_F32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VEC_RD_1:.*]] = vector.transfer_read %[[ARG_1]][], %[[C0_F32]] : tensor<f32>, vector<f32>
// CHECK: %[[ACC_f32:.*]] = vector.extractelement %[[VEC_RD_1]][] : vector<f32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[ACC_f32]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
// CHECK: %[[VEC_f32:.*]] = vector.broadcast %[[REDUCE]] : f32 to vector<f32>
// CHECK: %{{.*}} = vector.transfer_write %[[VEC_f32]], %[[ARG_1]][] : vector<f32>, tensor<f32>

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
transform.yield
}
}

// -----

// Note: scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir.
func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
%arg1: tensor<?xf32>) -> tensor<?xf32> {
%0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"] }
ins(%arg0 : tensor<?x?xf32>)
outs(%arg1 : tensor<?xf32>) {
^bb(%in: f32, %out: f32) :
%0 = arith.addf %in, %out : f32
linalg.yield %0 : f32
} -> tensor<?xf32>
return %0 : tensor<?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_2d(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[8]xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[8]xf32> } : vector<4x[8]xi1> -> vector<4x[8]xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_1d:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_1d]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[VEC_RD_1]] [1] : vector<4x[8]xf32> to vector<4xf32> } : vector<4x[8]xi1> -> vector<4xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %{{.*}} = vector.mask %[[MASK_1d]] { vector.transfer_write %[[REDUCE]], %[[ARG_1]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [4, [8]] : !transform.any_op
transform.yield
}
}

// -----

func.func @vectorize_dynamic_matvec_trailing_reduction_dim(%arg0: tensor<?x?xf32>,
%arg1: tensor<?xf32>,
%arg2: tensor<?xf32>) {
linalg.matvec ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
outs(%arg2 : tensor<?xf32>) -> tensor<?xf32>
return
}

// CHECK-LABEL: func.func @vectorize_dynamic_matvec_trailing_reduction_dim(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[4]xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[4]xf32> } : vector<4x[4]xi1> -> vector<4x[4]xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<[4]xi1>
// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<4x[4]xf32> } : vector<[4]xi1> -> vector<4x[4]xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
// CHECK: %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<4x[4]xf32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<4x[4]xf32> to vector<4xf32> } : vector<4x[4]xi1> -> vector<4xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [4, [4]] : !transform.any_op
transform.yield
}
}

// -----

func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(%arg0: tensor<?x?xf32>,
%arg1: tensor<?xf32>,
%arg2: tensor<?xf32>) -> tensor<?xf32> {
%0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"] }
ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
outs(%arg2 : tensor<?xf32>) {
^bb(%mat: f32, %vec: f32, %res: f32) :
%0 = arith.mulf %mat, %vec : f32
%1 = arith.addf %res, %0 : f32
linalg.yield %1 : f32
} -> tensor<?xf32>
return %0 : tensor<?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) -> tensor<?xf32> {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<[4]x4xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<[4]x4xf32> } : vector<[4]x4xi1> -> vector<[4]x4xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<4xi1>
// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<[4]x4xf32> } : vector<4xi1> -> vector<[4]x4xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
// CHECK: %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<[4]x4xf32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<[4]x4xf32> to vector<[4]xf32> } : vector<[4]x4xi1> -> vector<[4]xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32> } : vector<[4]xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [[4], 4] : !transform.any_op
transform.yield
}
}
84 changes: 73 additions & 11 deletions mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -129,35 +129,35 @@ module attributes {transform.with_named_sequence} {

// -----

func.func @linalg_reduce_scalable(%input: tensor<?xf32>,
%acc: tensor<f32>) -> tensor<f32> {
func.func @linalg_reduce_scalable_leading_dim(%input: tensor<?x?xf32>,
%acc: tensor<?xf32>) -> tensor<?xf32> {

// expected-error @+1 {{Attempted to vectorize, but failed}}
%0 = linalg.reduce ins(%input : tensor<?xf32>) outs(%acc : tensor<f32>) dimensions = [0]
%0 = linalg.reduce ins(%input : tensor<?x?xf32>) outs(%acc : tensor<?xf32>) dimensions = [0]
(%in: f32, %init: f32) {
%0 = arith.addf %in, %init : f32
linalg.yield %0 : f32
}
return %0 : tensor<f32>
return %0 : tensor<?xf32>
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
transform.yield
}
}

// -----

func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
%acc: tensor<?xf32>) -> tensor<?xf32> {
func.func @linalg_generic_reduction_scalable_leading_dim(%input: tensor<?x?xf32>,
%acc: tensor<?xf32>) -> tensor<?xf32> {

// expected-error @+1 {{Attempted to vectorize, but failed}}
%0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"] }
affine_map<(d0, d1) -> (d1)>],
iterator_types = ["reduction", "parallel"] }
ins(%input : tensor<?x?xf32>)
outs(%acc : tensor<?xf32>) {
^bb(%in: f32, %out: f32) :
@@ -170,7 +170,24 @@ func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
transform.yield
}
}

// -----

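// Two scalable dims are only supported for the last two adjacent parallel dims
// (Matmul-like ops); for matvec the trailing dim is a reduction, so requesting
// [[4], [4]] is rejected.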
func.func @linalg_matvec_scalable_two_dims(%A: memref<?x?xf32>, %B: memref<?xf32>, %C: memref<?xf32>) {
// expected-error @+1 {{Attempted to vectorize, but failed}}
linalg.matvec ins(%A, %B: memref<?x?xf32>, memref<?xf32>)
outs(%C: memref<?xf32>)
return
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%matmul = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %matmul vector_sizes [[4], [4]] : !transform.any_op
transform.yield
}
}
@@ -180,7 +197,7 @@ module attributes {transform.with_named_sequence} {
func.func @linalg_matmul_scalable_leading_parallel_dim(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
// expected-error @+1 {{Attempted to vectorize, but failed}}
linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
outs(%C: memref<?x?xf32>)
outs(%C: memref<?x?xf32>)
return
}

@@ -191,3 +208,48 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}

// -----

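// Scalable vectorization of the reduction dim of Matmul-like ops is not yet
// supported, so vector_sizes [8, 16, [4]] is rejected.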
func.func @linalg_matmul_scalable_trailing_reduction_dim(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
// expected-error @+1 {{Attempted to vectorize, but failed}}
linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
outs(%C: memref<?x?xf32>)
return
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %matmul vector_sizes [8, 16, [4]] : !transform.any_op
transform.yield
}
}

// -----

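// With vector_sizes [2, [4], [4]] the two scalable dims end in a reduction
// dim, which is rejected.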
func.func @linalg_generic_matmul_scalable_two_trailing_dims(%A: tensor<?x64xf32>, %B: tensor<64x?xf32>,
%C: tensor<?x?xf32>) -> tensor<?x?xf32> {

// expected-error @+1 {{Attempted to vectorize, but failed}}
%0 = linalg.generic { indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
affine_map<(d0, d1, d2) -> (d2, d1)>,
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"] }
ins(%A, %B : tensor<?x64xf32>, tensor<64x?xf32>)
outs(%C: tensor<?x?xf32>) {
^bb(%in1: f32, %in2: f32, %out: f32) :
%0 = arith.mulf %in1, %in2 : f32
%1 = arith.addf %0, %out : f32
linalg.yield %1 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [2, [4], [4]] : !transform.any_op
transform.yield
}
}
175 changes: 175 additions & 0 deletions mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
@@ -0,0 +1,175 @@
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \
// DEFINE: -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
// DEFINE: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = reduce_1d_f32
// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
// DEFINE: -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext

// RUN: %{compile}

// RUN: %{run} | FileCheck %s --check-prefix=REDUCE-F32

// REDEFINE: %{entry_point} = reduce_1d_i32
// RUN: %{run} | FileCheck %s --check-prefix=REDUCE-I32

// REDEFINE: %{entry_point} = generic_reduce_1d_f32
// RUN: %{run} | FileCheck %s --check-prefix=GENERIC-F32

func.func @reduce_1d_f32() {
// 1-D Tensor
%N = arith.constant 1000 : index
%c0_f32 = arith.constant 0.0 : f32

// Allocate the input and output tensors
%A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
%C_alloc = bufferization.alloc_tensor() : tensor<f32>

// Initialise the tensors
%pi = arith.constant 3.1416 : f32
%A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
%C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>

// Reduce
%C_out = linalg.reduce ins(%A_in : tensor<?xf32>) outs(%C_in: tensor<f32>) dimensions = [0]
(%in: f32, %init: f32) {
%0 = arith.addf %in, %init : f32
linalg.yield %0 : f32
}

// Print and verify the output
// REDUCE-F32-LABEL: SVE: START OF TEST OUTPUT
vector.print str "SVE: START OF TEST OUTPUT\n"

// REDUCE-F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
// REDUCE-F32-NEXT: [3141.6]

%xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

// REDUCE-F32-NEXT: SVE: END OF TEST OUTPUT
vector.print str "SVE: END OF TEST OUTPUT\n"

return
}

func.func @reduce_1d_i32() {
// 1-D Tensor
%N = arith.constant 1000 : index
%c0_i32 = arith.constant 0 : i32

// Allocate the input and output tensors
%A_alloc = bufferization.alloc_tensor(%N) : tensor<?xi32>
%C_alloc = bufferization.alloc_tensor() : tensor<i32>

// Initialise the tensors
%pi = arith.constant 3 : i32
%A_in = linalg.fill ins(%pi : i32) outs(%A_alloc : tensor<?xi32>) -> tensor<?xi32>
%C_in = tensor.insert %c0_i32 into %C_alloc[] : tensor<i32>

// Reduce
%C_out = linalg.reduce ins(%A_in : tensor<?xi32>) outs(%C_in: tensor<i32>) dimensions = [0]
(%in: i32, %init: i32) {
%0 = arith.addi %in, %init : i32
linalg.yield %0 : i32
}

// Print and verify the output
// REDUCE-I32-LABEL: SVE: START OF TEST OUTPUT
vector.print str "SVE: START OF TEST OUTPUT\n"

// REDUCE-I32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
// REDUCE-I32-NEXT: [3000]

%xf = tensor.cast %C_out : tensor<i32> to tensor<*xi32>
call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()

// REDUCE-I32-NEXT: SVE: END OF TEST OUTPUT
vector.print str "SVE: END OF TEST OUTPUT\n"

return
}

func.func @generic_reduce_1d_f32() {
// 1-D Tensor
%N = arith.constant 1000 : index
%c0_f32 = arith.constant 0.0 : f32

// Allocate the input and output tensors
%A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
%C_alloc = bufferization.alloc_tensor() : tensor<f32>

// Initialise the tensors
%pi = arith.constant 3.1416 : f32
%A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
%C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>

// Reduce
%C_out = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
affine_map<(d0) -> ()>],
iterator_types = ["reduction"] }
ins(%A_in : tensor<?xf32>)
outs(%C_in : tensor<f32>) {
^bb(%in: f32, %out: f32) :
%0 = arith.addf %in, %out : f32
linalg.yield %0 : f32
} -> tensor<f32>

// Print and verify the output
// GENERIC-F32-LABEL: SVE: START OF TEST OUTPUT
vector.print str "SVE: START OF TEST OUTPUT\n"

// GENERIC-F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
// GENERIC-F32-NEXT: [3141.6]

%xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

// GENERIC-F32-NEXT: SVE: END OF TEST OUTPUT
vector.print str "SVE: END OF TEST OUTPUT\n"

return
}

module attributes {transform.with_named_sequence} {
// A sequence that will tile and vectorise a Reduce Op
transform.named_sequence @tile_and_vectorize_reduce(%func
: !transform.op<"func.func"> {transform.readonly}) {

// Step 0: Get a handle to the reduce Op
%reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
: (!transform.op<"func.func">) -> !transform.any_op

// Step 1: Tile
%tiled_reduce, %loops:1 = transform.structured.tile_using_for %reduce tile_sizes [[4]]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Step 2: Vectorize
transform.structured.vectorize %tiled_reduce vector_sizes [[4]] : !transform.any_op

// Step 3: Lower vector.multi_reduction
transform.apply_patterns to %func {
transform.apply_patterns.vector.lower_masked_transfers
transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
} : !transform.op<"func.func">

transform.yield
}

// A sequence that goes over all functions in this module and applies
// "tile_and_vectorize_reduce"
transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
%funcs = transform.structured.match ops{["func.func"]} in %module
: (!transform.any_op) -> !transform.op<"func.func">

transform.foreach %funcs : !transform.op<"func.func"> {
^bb2(%func : !transform.op<"func.func">):
transform.include @tile_and_vectorize_reduce failures(propagate)
(%func) : (!transform.op<"func.func">) -> ()
}
transform.yield
}
}

func.func private @printMemrefF32(%ptr : tensor<*xf32>)
func.func private @printMemrefI32(%ptr : tensor<*xi32>)
180 changes: 180 additions & 0 deletions mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
@@ -0,0 +1,180 @@
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \
// DEFINE: -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
// DEFINE: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = reduce_2d_f32
// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
// DEFINE: -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext

// RUN: %{compile}

// RUN: %{run} | FileCheck %s --check-prefix=REDUCE

// REDEFINE: %{entry_point} = generic_reduce_2d_f32
// RUN: %{run} | FileCheck %s --check-prefix=GENERIC

func.func @reduce_2d_f32() {
// 2-D Tensor
%M = arith.constant 16 : index
%N = arith.constant 1000 : index
%c0_f32 = arith.constant 0.0 : f32

// Allocate the input and output tensors
%A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
%C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>

// Initialise the tensors
%pi = arith.constant 3.1416 : f32
%A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
%C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>

// Reduce
%C_out = linalg.reduce ins(%A_in : tensor<?x?xf32>) outs(%C_in: tensor<?xf32>) dimensions = [1]
(%in: f32, %init: f32) {
%0 = arith.addf %in, %init : f32
linalg.yield %0 : f32
}

// Print and verify the output
// REDUCE-LABEL: SVE: START OF TEST OUTPUT
vector.print str "SVE: START OF TEST OUTPUT\n"

// REDUCE-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
// REDUCE-NEXT: [3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6]

%xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

// REDUCE-NEXT: SVE: END OF TEST OUTPUT
vector.print str "SVE: END OF TEST OUTPUT\n"

return
}

func.func @generic_reduce_2d_f32() {
// 2-D Tensor
%M = arith.constant 16 : index
%N = arith.constant 1000 : index
%c0_f32 = arith.constant 0.0 : f32

// Allocate the input and output tensors
%A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
%C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>

// Initialise the tensors
%pi = arith.constant 3.1416 : f32
%A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
%C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>

// Reduce
%C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"] }
ins(%A_in : tensor<?x?xf32>)
outs(%C_in : tensor<?xf32>) {
^bb(%in: f32, %out: f32) :
%0 = arith.addf %in, %out : f32
linalg.yield %0 : f32
} -> tensor<?xf32>

// Print and verify the output
// GENERIC-LABEL: SVE: START OF TEST OUTPUT
vector.print str "SVE: START OF TEST OUTPUT\n"

// GENERIC-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
// GENERIC-NEXT: [3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6]

%xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

// GENERIC-NEXT: SVE: END OF TEST OUTPUT
vector.print str "SVE: END OF TEST OUTPUT\n"

return
}

func.func @generic_reduce_2d_i32() {
// 2-D Tensor
%M = arith.constant 16 : index
%N = arith.constant 1000 : index
%c0_i32 = arith.constant 0 : i32

// Allocate the input and output tensors
%A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xi32>
%C_alloc = bufferization.alloc_tensor(%M) : tensor<?xi32>

// Initialise the tensors
%pi = arith.constant 3 : i32
%A_in = linalg.fill ins(%pi : i32) outs(%A_alloc : tensor<?x?xi32>) -> tensor<?x?xi32>
%C_in = linalg.fill ins(%c0_i32 : i32) outs(%C_alloc : tensor<?xi32>) -> tensor<?xi32>

// Reduce
%C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"] }
ins(%A_in : tensor<?x?xi32>)
outs(%C_in : tensor<?xi32>) {
^bb(%in: i32, %out: i32) :
%0 = arith.addi %in, %out : i32
linalg.yield %0 : i32
} -> tensor<?xi32>

// Print and verify the output
// GENERIC-I32-LABEL: SVE: START OF TEST OUTPUT
vector.print str "SVE: START OF TEST OUTPUT\n"

// GENERIC-I32-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
// GENERIC-I32-NEXT: [3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000]

%xf = tensor.cast %C_out : tensor<?xi32> to tensor<*xi32>
call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()

// GENERIC-I32-NEXT: SVE: END OF TEST OUTPUT
vector.print str "SVE: END OF TEST OUTPUT\n"

return
}


module attributes {transform.with_named_sequence} {
// A sequence that will tile and vectorise a Reduce Op
transform.named_sequence @tile_and_vectorize_reduce(%func
: !transform.op<"func.func"> {transform.readonly}) {

// Step 0: Get a handle to the reduce Op
%reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
: (!transform.op<"func.func">) -> !transform.any_op

// Step 1: Tile
%tiled_reduce, %loops:2 = transform.structured.tile_using_for %reduce tile_sizes [1, [4]]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)

// Step 2: Vectorize
transform.structured.vectorize %tiled_reduce vector_sizes [1, [4]] : !transform.any_op

// Step 3: Lower vector.multi_reduction
transform.apply_patterns to %func {
transform.apply_patterns.vector.lower_masked_transfers
transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
} : !transform.op<"func.func">

transform.yield
}

// A sequence that goes over all functions in this module and applies
// "tile_and_vectorize_reduce"
transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
%funcs = transform.structured.match ops{["func.func"]} in %module
: (!transform.any_op) -> !transform.op<"func.func">

transform.foreach %funcs : !transform.op<"func.func"> {
^bb2(%func : !transform.op<"func.func">):
transform.include @tile_and_vectorize_reduce failures(propagate)
(%func) : (!transform.op<"func.func">) -> ()
}
transform.yield
}
}

func.func private @printMemrefF32(%ptr : tensor<*xf32>)
func.func private @printMemrefI32(%ptr : tensor<*xi32>)