diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index a712b4632e9a8..0302d3030375e 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -41,6 +42,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/MatrixBuilder.h"
+#include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/CommandLine.h"
@@ -325,6 +327,25 @@ computeShapeInfoForInst(Instruction *I,
     return OpShape->second;
   }

+  if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::vector_reduce_fadd:
+    case Intrinsic::vector_reduce_fmul:
+    case Intrinsic::vector_reduce_fmax:
+    case Intrinsic::vector_reduce_fmaximum:
+    case Intrinsic::vector_reduce_fmin:
+    case Intrinsic::vector_reduce_fminimum:
+    case Intrinsic::vector_reduce_add:
+    case Intrinsic::vector_reduce_and:
+    case Intrinsic::vector_reduce_mul:
+    case Intrinsic::vector_reduce_or:
+    case Intrinsic::vector_reduce_xor:
+      return ShapeInfo(1, 1);
+    default:
+      break;
+    }
+  }
+
   if (isUniformShape(I) || isa<SelectInst>(I)) {
     auto Ops = I->operands();
     auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops;
@@ -468,7 +489,7 @@ class LowerMatrixIntrinsics {
       return make_range(Vectors.begin(), Vectors.end());
     }

-    iterator_range<SmallVector<Value *, 8>::iterator> vectors() {
+    iterator_range<SmallVector<Value *, 8>::const_iterator> vectors() const {
       return make_range(Vectors.begin(), Vectors.end());
     }

@@ -701,7 +722,31 @@ class LowerMatrixIntrinsics {
       case Intrinsic::matrix_transpose:
       case Intrinsic::matrix_column_major_load:
       case Intrinsic::matrix_column_major_store:
+      case Intrinsic::vector_reduce_fmax:
+      case Intrinsic::vector_reduce_fmaximum:
+      case Intrinsic::vector_reduce_fmin:
+      case Intrinsic::vector_reduce_fminimum:
+      case Intrinsic::vector_reduce_add:
+      case Intrinsic::vector_reduce_and:
+      case Intrinsic::vector_reduce_mul:
+      case Intrinsic::vector_reduce_or:
+      case Intrinsic::vector_reduce_xor:
         return true;
+      case Intrinsic::vector_reduce_fadd:
+      case Intrinsic::vector_reduce_fmul: {
+        FastMathFlags FMF = getFastMathFlags(Inst);
+        if (Inst->getType()->isFloatingPointTy() && !FMF.allowReassoc())
+          return false;
+
+        if (match(Inst, m_Intrinsic<Intrinsic::vector_reduce_fadd>(
+                            m_Unless(m_AnyZeroFP()), m_Value())))
+          return false;
+
+        if (match(Inst, m_Intrinsic<Intrinsic::vector_reduce_fmul>(
+                            m_Unless(m_FPOne()), m_Value())))
+          return false;
+        return true;
+      }
       default:
         return isUniformShape(II);
       }
@@ -1268,6 +1313,113 @@ class LowerMatrixIntrinsics {
       return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                      Result.getNumVectors());
     }
+    case Intrinsic::vector_reduce_fadd: {
+      Builder.setFastMathFlags(getFastMathFlags(Inst));
+      auto *I = Inst2ColumnMatrix.find(Inst->getOperand(1));
+      assert(I != Inst2ColumnMatrix.end());
+      const MatrixTy &M = I->second;
+
+      Value *Start = Inst->getOperand(0);
+      Value *ResultV = Builder.CreateVectorSplat(
+          ElementCount::getFixed(M.getStride()), Start);
+      for (auto &Vector : M.vectors())
+        ResultV = Builder.CreateFAdd(ResultV, Vector);
+
+      Value *Result = Builder.CreateFAddReduce(Start, ResultV);
+      Inst->replaceAllUsesWith(Result);
+      Result->takeName(Inst);
+      return MatrixTy{Result};
+    } break;
+    case Intrinsic::vector_reduce_fmul: {
+      Builder.setFastMathFlags(getFastMathFlags(Inst));
+      auto *I = Inst2ColumnMatrix.find(Inst->getOperand(1));
+      assert(I != Inst2ColumnMatrix.end());
+      const MatrixTy &M = I->second;
+
+      Value *Start = Inst->getOperand(0);
+      Value *ResultV = Builder.CreateVectorSplat(
+          ElementCount::getFixed(M.getStride()), Start);
+      for (auto &Vector : M.vectors())
+        ResultV = Builder.CreateFMul(ResultV, Vector);
+
+      Value *Result = Builder.CreateFMulReduce(Start, ResultV);
+      Inst->replaceAllUsesWith(Result);
+      Result->takeName(Inst);
+      return MatrixTy{Result};
+    } break;
+    case Intrinsic::vector_reduce_fmax:
+    case Intrinsic::vector_reduce_fmaximum:
+    case Intrinsic::vector_reduce_fmin:
+    case Intrinsic::vector_reduce_fminimum:
+    case Intrinsic::vector_reduce_add:
+    case Intrinsic::vector_reduce_and:
+    case Intrinsic::vector_reduce_mul:
+    case Intrinsic::vector_reduce_or:
+    case Intrinsic::vector_reduce_xor: {
+      Builder.setFastMathFlags(getFastMathFlags(Inst));
+      auto *I = Inst2ColumnMatrix.find(Inst->getOperand(0));
+      assert(I != Inst2ColumnMatrix.end());
+      const MatrixTy &M = I->second;
+
+      auto CreateVReduce = [&](Value *LHS, Value *RHS) {
+        switch (Inst->getIntrinsicID()) {
+        case Intrinsic::vector_reduce_add:
+          return Builder.CreateAdd(LHS, RHS);
+        case Intrinsic::vector_reduce_and:
+          return Builder.CreateAnd(LHS, RHS);
+        case Intrinsic::vector_reduce_fmax:
+          return Builder.CreateMaximum(LHS, RHS);
+        case Intrinsic::vector_reduce_fmaximum:
+          return Builder.CreateMaximumNum(LHS, RHS);
+        case Intrinsic::vector_reduce_fmin:
+          return Builder.CreateMinimum(LHS, RHS);
+        case Intrinsic::vector_reduce_fminimum:
+          return Builder.CreateMinimumNum(LHS, RHS);
+        case Intrinsic::vector_reduce_mul:
+          return Builder.CreateMul(LHS, RHS);
+        case Intrinsic::vector_reduce_or:
+          return Builder.CreateOr(LHS, RHS);
+        case Intrinsic::vector_reduce_xor:
+          return Builder.CreateXor(LHS, RHS);
+        default:
+          llvm_unreachable("unexpected intrinsic");
+        }
+      };
+
+      Value *ResultV = M.getVector(0);
+      for (auto &Vector : drop_begin(M.vectors()))
+        ResultV = CreateVReduce(ResultV, Vector);
+
+      auto CreateHReduce = [&](Value *V) {
+        switch (Inst->getIntrinsicID()) {
+        case Intrinsic::vector_reduce_add:
+          return Builder.CreateAddReduce(V);
+        case Intrinsic::vector_reduce_and:
+          return Builder.CreateAndReduce(V);
+        case Intrinsic::vector_reduce_fmax:
+          return Builder.CreateFPMaxReduce(V);
+        case Intrinsic::vector_reduce_fmaximum:
+          return Builder.CreateFPMaximumReduce(V);
+        case Intrinsic::vector_reduce_fmin:
+          return Builder.CreateFPMinReduce(V);
+        case Intrinsic::vector_reduce_fminimum:
+          return Builder.CreateFPMinimumReduce(V);
+        case Intrinsic::vector_reduce_mul:
+          return Builder.CreateMulReduce(V);
+        case Intrinsic::vector_reduce_or:
+          return Builder.CreateOrReduce(V);
+        case Intrinsic::vector_reduce_xor:
+          return Builder.CreateXorReduce(V);
+        default:
+          llvm_unreachable("unexpected intrinsic");
+        }
+      };
+
+      Value *Result = CreateHReduce(ResultV);
+      Inst->replaceAllUsesWith(Result);
+      Result->takeName(Inst);
+      return MatrixTy{Result};
+    } break;
     default:
       break;
     }
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/reduce.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/reduce.ll
new file mode 100644
index 0000000000000..503378bebb85b
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/reduce.ll
@@ -0,0 +1,300 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define i32 @reduce_add_4x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_4x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call i32 @llvm.vector.reduce.add(<8 x i32> %inv)
+  ret i32 %reduce
+}
+
+define i32 @reduce_add_8x1(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_8x1(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <8 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[COL_LOAD]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 8, i1 1, i32 8, i32 1)
+  %reduce = call i32 @llvm.vector.reduce.add(<8 x i32> %inv)
+  ret i32 %reduce
+}
+
+define i32 @reduce_add_1x8(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_1x8(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <1 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 1
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    [[VEC_GEP4:%.*]] = getelementptr i32, ptr [[IN]], i64 3
+; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP4]], align 4
+; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr i32, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD7:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP6]], align 4
+; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr i32, ptr [[IN]], i64 5
+; CHECK-NEXT:    [[COL_LOAD9:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP8]], align 4
+; CHECK-NEXT:    [[VEC_GEP10:%.*]] = getelementptr i32, ptr [[IN]], i64 6
+; CHECK-NEXT:    [[COL_LOAD11:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP10]], align 4
+; CHECK-NEXT:    [[VEC_GEP12:%.*]] = getelementptr i32, ptr [[IN]], i64 7
+; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP12]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add <1 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <1 x i32> [[TMP1]], [[COL_LOAD3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <1 x i32> [[TMP2]], [[COL_LOAD5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <1 x i32> [[TMP3]], [[COL_LOAD7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <1 x i32> [[TMP4]], [[COL_LOAD9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <1 x i32> [[TMP5]], [[COL_LOAD11]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <1 x i32> [[TMP6]], [[COL_LOAD13]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> [[TMP7]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 1, i1 1, i32 1, i32 8)
+  %reduce = call i32 @llvm.vector.reduce.add(<8 x i32> %inv)
+  ret i32 %reduce
+}
+
+define i32 @reduce_add_1x3(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_1x3(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <1 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 1
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add <1 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <1 x i32> [[TMP1]], [[COL_LOAD3]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <3 x i32> @llvm.matrix.column.major.load(ptr %in, i64 1, i1 1, i32 1, i32 3)
+  %reduce = call i32 @llvm.vector.reduce.add(<3 x i32> %inv)
+  ret i32 %reduce
+}
+
+define i32 @reduce_add_3x1(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_3x1(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <3 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[COL_LOAD]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <3 x i32> @llvm.matrix.column.major.load(ptr %in, i64 3, i1 1, i32 3, i32 1)
+  %reduce = call i32 @llvm.vector.reduce.add(<3 x i32> %inv)
+  ret i32 %reduce
+}
+
+define i32 @reduce_and(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_and(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call i32 @llvm.vector.reduce.and(<8 x i32> %inv)
+  ret i32 %reduce
+}
+
+define i32 @reduce_or(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_or(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call i32 @llvm.vector.reduce.or(<8 x i32> %inv)
+  ret i32 %reduce
+}
+
+define i32 @reduce_mul(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_mul(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call i32 @llvm.vector.reduce.mul(<8 x i32> %inv)
+  ret i32 %reduce
+}
+
+define i32 @reduce_xor(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_xor(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call i32 @llvm.vector.reduce.xor(<8 x i32> %inv)
+  ret i32 %reduce
+}
+
+define float @reduce_fadd(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fadd(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[REDUCE:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call float @llvm.vector.reduce.fadd(float 0., <8 x float> %inv)
+  ret float %reduce
+}
+
+define float @reduce_fadd_reassoc(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fadd_reassoc(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd reassoc <4 x float> zeroinitializer, [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd reassoc <4 x float> [[TMP1]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call reassoc float @llvm.vector.reduce.fadd(float 0., <8 x float> %inv)
+  ret float %reduce
+}
+
+define float @reduce_fadd_contract(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fadd_contract(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[REDUCE:%.*]] = call contract float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call contract float @llvm.vector.reduce.fadd(float 0., <8 x float> %inv)
+  ret float %reduce
+}
+
+define float @reduce_fadd_reassoccontract(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fadd_reassoccontract(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd reassoc contract <4 x float> zeroinitializer, [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd reassoc contract <4 x float> [[TMP1]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call reassoc contract float @llvm.vector.reduce.fadd(float 0., <8 x float> %inv)
+  ret float %reduce
+}
+
+define float @reduce_fadd_weirdstart(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fadd_weirdstart(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[REDUCE:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 1.000000e+00, <8 x float> [[TMP1]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call reassoc float @llvm.vector.reduce.fadd(float 1., <8 x float> %inv)
+  ret float %reduce
+}
+
+define float @reduce_fmul_reassoc(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fmul_reassoc(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc <4 x float> splat (float 1.000000e+00), [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul reassoc <4 x float> [[TMP1]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call reassoc float @llvm.vector.reduce.fmul(float 1., <8 x float> %inv)
+  ret float %reduce
+}
+
+define float @reduce_fmul_weirdstart(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fmul_weirdstart(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[REDUCE:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call reassoc float @llvm.vector.reduce.fmul(float 0., <8 x float> %inv)
+  ret float %reduce
+}
+
+define float @reduce_fmax(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fmax(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.maximum.v4f32(<4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD1]])
+; CHECK-NEXT:    [[REDUCE:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP1]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call float @llvm.vector.reduce.fmax(<8 x float> %inv)
+  ret float %reduce
+}
+
+define float @reduce_fmaximum(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fmaximum(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD1]])
+; CHECK-NEXT:    [[REDUCE:%.*]] = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[TMP1]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call float @llvm.vector.reduce.fmaximum(<8 x float> %inv)
+  ret float %reduce
+}
+
+define float @reduce_fmin(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fmin(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.minimum.v4f32(<4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD1]])
+; CHECK-NEXT:    [[REDUCE:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP1]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call float @llvm.vector.reduce.fmin(<8 x float> %inv)
+  ret float %reduce
+}
+
+define float @reduce_fminimum(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_fminimum(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x float>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD1]])
+; CHECK-NEXT:    [[REDUCE:%.*]] = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> [[TMP1]])
+; CHECK-NEXT:    ret float [[REDUCE]]
+;
+  %inv = call <8 x float> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call float @llvm.vector.reduce.fminimum(<8 x float> %inv)
+  ret float %reduce
+}