Skip to content

Commit

Permalink
[RISCV][CostModel] Model vrgather.vv as being quadratic in LMUL
Browse files Browse the repository at this point in the history
vrgather.vv across multiple vector registers (i.e. LMUL > 1) requires all-to-all data movement. This change includes two conceptual sets of changes:

    For permutes, we were modeling these as being linear in LMUL.
    For reverse, we were modeling them as being fixed cost in LMUL.

Both were wrong, and have been adjusted to O(LMUL^2).  Noticed via code inspection while looking at something else.

It's worth asking whether we should be lowering reverse to something other than a vrgather at high LMULs. That shuffle is quite expensive.  (Future work)

Differential Revision: https://reviews.llvm.org/D152019
  • Loading branch information
preames committed Jul 18, 2023
1 parent d2ac006 commit 7cc6b80
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 106 deletions.
26 changes: 16 additions & 10 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,12 @@ static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

/// Estimate the cost of one vrgather.vv instruction operating on type VT.
///
/// The result is the LMUL cost of VT multiplied by itself, i.e. the cost is
/// modeled as quadratic in the number of vector registers implied by LMUL.
/// The cost of materializing the operands (the index vector and, where
/// applicable, the mask) is not included here; callers account for those
/// separately.
InstructionCost RISCVTTIImpl::getVRGatherVVCost(MVT VT) {
  // Query the per-type LMUL cost once and square it.
  const auto LMULCost = getLMULCost(VT);
  return LMULCost * LMULCost;
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *Tp, ArrayRef<int> Mask,
Expand Down Expand Up @@ -311,7 +317,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
LT.second.getVectorNumElements() <= 256)) {
VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
return IndexCost + getLMULCost(LT.second);
return IndexCost + getVRGatherVVCost(LT.second);
}
}
break;
Expand All @@ -331,7 +337,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
return 2 * IndexCost + 2 * getLMULCost(LT.second) + MaskCost;
return 2 * IndexCost + 2 * getVRGatherVVCost(LT.second) + MaskCost;
}
}
break;
Expand Down Expand Up @@ -407,11 +413,11 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return 2 * LT.first * getLMULCost(LT.second);
case TTI::SK_Reverse: {
// TODO: Cases to improve here:
// * LMUL > 1
// * Illegal vector types
// * i64 on RV32
// * i1 vector

// Most of the cost here is producing the vrgather index register
// At low LMUL, most of the cost is producing the vrgather index register.
// At high LMUL, the cost of the vrgather itself will dominate.
// Example sequence:
// csrr a0, vlenb
// srli a0, a0, 3
Expand All @@ -420,14 +426,14 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// vid.v v9
// vrsub.vx v10, v9, a0
// vrgather.vv v9, v8, v10
unsigned LenCost = 3;
InstructionCost LenCost = 3;
if (LT.second.isFixedLengthVector())
// vrsub.vi has a 5 bit immediate field, otherwise an li suffices
LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
if (Tp->getElementType()->isIntegerTy(1))
// Mask operation additionally required extend and truncate
return LT.first * (LenCost + 6);
return LT.first * (LenCost + 3);
InstructionCost GatherCost = 2 + getVRGatherVVCost(LT.second);
// Mask operation additionally required extend and truncate
InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
return LT.first * (LenCost + GatherCost + ExtendCost);
}
}
return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
return ST->useRVVForFixedLengthVectors() ? 16 : 0;
}

InstructionCost getVRGatherVVCost(MVT VT);

InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
Expand Down
27 changes: 18 additions & 9 deletions llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,20 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x

define void @vector_reverse() {
; CHECK-LABEL: 'vector_reverse'
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
Expand All @@ -81,6 +84,9 @@ define void @vector_reverse() {
%reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
%reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
%reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
%reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
%reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
%reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
%reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
%reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
%reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
Expand All @@ -98,6 +104,9 @@ declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4
declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
declare <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64>)
declare <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64>)
declare <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64>)
declare <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64>)
declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ define <8 x i32> @interleave2_v8i32(<4 x i32> %v0, <4 x i32> %v1) {
define <8 x i64> @interleave2_v8i64(<4 x i64> %v0, <4 x i64> %v1) {
; RV32-LABEL: 'interleave2_v8i64'
; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; RV32-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res
;
; RV64-LABEL: 'interleave2_v8i64'
; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; RV64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res
;
%concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
Expand Down
Loading

0 comments on commit 7cc6b80

Please sign in to comment.