Skip to content

Commit

Permalink
[ARM] MVE FP16 cost adjustments
Browse files Browse the repository at this point in the history
This adjusts the MVE fp16 cost model, similar to how we already do for
integer casts. It uses the base cost of 1 per cvt for most fp extend /
truncates, but adjusts it for loads and stores where we know that a
extending load has been used to get the load into the correct lane, and
only an MVE VCVTB is then needed.

Differential Revision: https://reviews.llvm.org/D81813
  • Loading branch information
davemgreen committed Jul 6, 2020
1 parent cd9a241 commit 146dad0
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 23 deletions.
39 changes: 39 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
Expand Up @@ -229,6 +229,18 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}

static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
// FPExtends are similar but also require the VCVT instructions.
{ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
{ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
};
if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
if (const auto *Entry =
ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
}

// The truncate of a store is free. This is the mirror of extends above.
Expand All @@ -247,6 +259,17 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}

static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
{ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
{ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
};
if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
if (const auto *Entry =
ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, SrcTy.getSimpleVT(),
DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
}

// NEON vector operations that can extend their inputs.
Expand Down Expand Up @@ -955,6 +978,22 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first * 4;
}

// MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
// Same for stores.
if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
((Opcode == Instruction::Load && I->hasOneUse() &&
isa<FPExtInst>(*I->user_begin())) ||
(Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
Type *DstTy =
Opcode == Instruction::Load
? (*I->user_begin())->getType()
: cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
DstTy->getScalarType()->isFloatTy())
return ST->getMVEVectorCostFactor();
}

int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
? ST->getMVEVectorCostFactor()
: 1;
Expand Down
30 changes: 15 additions & 15 deletions llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
Expand Up @@ -1284,8 +1284,8 @@ define i32 @load_fpextends() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r2 = fpext half %loadf16 to double
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r3 = fpext float %loadf32 to double
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double>
Expand All @@ -1294,8 +1294,8 @@ define i32 @load_fpextends() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v9 = fpext <2 x float> %loadv2f32 to <2 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-V8M-MAIN-RECIP-LABEL: 'load_fpextends'
Expand Down Expand Up @@ -1396,8 +1396,8 @@ define i32 @load_fpextends() {
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r2 = fpext half %loadf16 to double
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r3 = fpext float %loadf32 to double
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double>
Expand All @@ -1407,7 +1407,7 @@ define i32 @load_fpextends() {
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-V8M-MAIN-SIZE-LABEL: 'load_fpextends'
Expand Down Expand Up @@ -1558,9 +1558,9 @@ define i32 @load_fptrunc() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %i3264 = fptrunc double undef to float
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float>
Expand All @@ -1569,7 +1569,7 @@ define i32 @load_fptrunc() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float %i3264, float* undef, align 4
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x half> %v21632, <2 x half>* undef, align 4
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x half> %v21664, <2 x half>* undef, align 4
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: store <4 x half> %v41632, <4 x half>* undef, align 8
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x half> %v41632, <4 x half>* undef, align 8
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: store <4 x half> %v41664, <4 x half>* undef, align 8
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x half> %v81632, <8 x half>* undef, align 16
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x half> %v81664, <8 x half>* undef, align 16
Expand Down Expand Up @@ -1658,9 +1658,9 @@ define i32 @load_fptrunc() {
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %i3264 = fptrunc double undef to float
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float>
Expand Down Expand Up @@ -2784,8 +2784,8 @@ define i32 @maskedload_fpextends() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v9 = fpext <2 x float> %loadv2f32 to <2 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-V8M-MAIN-RECIP-LABEL: 'maskedload_fpextends'
Expand Down Expand Up @@ -2877,7 +2877,7 @@ define i32 @maskedload_fpextends() {
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-V8M-MAIN-SIZE-LABEL: 'maskedload_fpextends'
Expand Down
Expand Up @@ -562,14 +562,12 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; TODO: this fpext could be allowed, but we don't lower it very efficiently yet,
; so reject this for now.
define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fpext_allowed(
; PREFER-FOLDING-NOT: vector.body:
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING-NOT: br i1 %{{.*}}, label %{{.*}}, label %vector.body
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
br label %for.body

Expand All @@ -591,14 +589,12 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; TODO: this fptrunc could be allowed, but we don't lower it very efficiently yet,
; so reject this for now.
define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: fptrunc_allowed(
; PREFER-FOLDING-NOT: vector.body:
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING-NOT: br i1 %{{.*}}, label %{{.*}}, label %vector.body
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
br label %for.body

Expand Down

0 comments on commit 146dad0

Please sign in to comment.