Skip to content

Commit

Permalink
[TTI][AArch64] Cost model insertelement and indexed LD1 instructions
Browse files Browse the repository at this point in the history
An indexed LD1 instruction (the "ASIMD load, 1 element, one lane, B/H/S"
instruction), which loads a value and inserts it as an element into a vector,
is an expensive instruction. It has a latency of 8 on modern cores. We
generate an indexed LD1 when an insertelement instruction has a load as an
operand; this patch recognises that case and makes the indexed LD1 more
expensive in the cost model.

Differential Revision: https://reviews.llvm.org/D141602
  • Loading branch information
sjoerdmeijer committed Feb 9, 2023
1 parent ec094d2 commit 079c488
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 16 deletions.
20 changes: 14 additions & 6 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2184,7 +2184,8 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
return 0;
}

InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
Type *Val,
unsigned Index,
bool HasRealUse) {
assert(Val->isVectorTy() && "This must be a vector type");
Expand All @@ -2210,14 +2211,21 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
// needed. So it has non-zero cost.
// - For the rest of cases (virtual instruction or element type is float),
// consider the instruction free.
//
if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
return 0;

// This is recognising a LD1 single-element structure to one lane of one
// register instruction. I.e., if this is an `insertelement` instruction,
// and its second operand is a load, then we will generate a LD1, which
// is an expensive instruction.
if (I && dyn_cast<LoadInst>(I->getOperand(1)))
return ST->getVectorInsertExtractBaseCost() + 1;

// FIXME:
// If the extract-element and insert-element instructions could be
// simplified away (e.g., could be combined into users by looking at use-def
// context), they have no cost. This is not done in the first place for
// compile-time considerations.
if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
return 0;
}

// All other insert/extracts cost this much.
Expand All @@ -2228,14 +2236,14 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
return getVectorInstrCostHelper(nullptr, Val, Index, false /* HasRealUse */);
}

InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) {
return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
}

InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
// indicates whether the vector instruction is available in the input IR or
// just imaginary in vectorizer passes.
InstructionCost getVectorInstrCostHelper(Type *Val, unsigned Index,
bool HasRealUse);
InstructionCost getVectorInstrCostHelper(const Instruction *I, Type *Val,
unsigned Index, bool HasRealUse);

public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,12 @@ define void @vectorInstrCost() {
define <8 x i8> @LD1_B(<8 x i8> %vec, ptr noundef %i) {
; KRYO-LABEL: 'LD1_B'
; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i8, ptr %i, align 1
; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %v2
;
; NEO-LABEL: 'LD1_B'
; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i8, ptr %i, align 1
; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %v2
;
entry:
Expand All @@ -125,12 +125,12 @@ entry:
define <4 x i16> @LD1_H(<4 x i16> %vec, ptr noundef %i) {
; KRYO-LABEL: 'LD1_H'
; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i16, ptr %i, align 2
; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %v2
;
; NEO-LABEL: 'LD1_H'
; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i16, ptr %i, align 2
; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %v2
;
entry:
Expand All @@ -142,12 +142,12 @@ entry:
define <4 x i32> @LD1_W(<4 x i32> %vec, ptr noundef %i) {
; KRYO-LABEL: 'LD1_W'
; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i32, ptr %i, align 4
; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v2
;
; NEO-LABEL: 'LD1_W'
; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i32, ptr %i, align 4
; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v2
;
entry:
Expand All @@ -159,12 +159,12 @@ entry:
define <2 x i64> @LD1_X(<2 x i64> %vec, ptr noundef %i) {
; KRYO-LABEL: 'LD1_X'
; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i64, ptr %i, align 8
; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v2
;
; NEO-LABEL: 'LD1_X'
; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i64, ptr %i, align 8
; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v2
;
entry:
Expand Down

0 comments on commit 079c488

Please sign in to comment.