diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index fffd696e59baf..df5e17a27d26c 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -220,6 +220,11 @@ Makes programs 10x faster by doing Special New Thing. * `.att_syntax` directive is now emitted for assembly files when AT&T syntax is in use. This matches the behaviour of Intel syntax and aids with compatibility when changing the default Clang syntax to the Intel syntax. +* Masked gather and scatter cost overheads are now per-shape on AMD znver4 + and znver5 targets via a new `TuningPreferAMDZenGSCost` subtarget + feature, replacing the single flat overhead inherited from the generic + AVX-512 path. The per-shape costs use empirical break-even values + measured on Zen 4 / Zen 5 hardware. ### Changes to the OCaml bindings diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 50fb7204ebfa1..28bbd639649bb 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -721,6 +721,17 @@ def TuningFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">; +// Use AMD Zen-tuned cost tables for masked gather/scatter intrinsics in the +// X86 TargetTransformInfo cost model. Refines the flat overhead used by other +// AVX-512 targets with per-element-type/per-VL costs measured on znver4 and +// znver5. Inherited automatically by every znver4+ CPU via ZN4Tuning; not +// applied to pre-AVX-512 Zen parts (znver1..3), which take the scalarise +// path for masked gather anyway. +def TuningPreferAMDZenGSCost + : SubtargetFeature<"prefer-amd-zen-gs-cost", + "HasPreferAMDZenGSCost", "true", + "Use AMD Zen-tuned gather/scatter cost tables in the cost model">; + // Generate vpdpwssd instead of vpmaddwd+vpaddd sequence. 
def TuningFastDPWSSD : SubtargetFeature< @@ -1631,7 +1642,8 @@ def ProcessorFeatures { list<SubtargetFeature> ZN3Features = !listconcat(ZN2Features, ZN3AdditionalFeatures); - list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD]; + list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD, + TuningPreferAMDZenGSCost]; list<SubtargetFeature> ZN4Tuning = !listconcat(ZN3Tuning, ZN4AdditionalTuning); list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 698be1615a04b..edc8e78c7f040 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -6253,20 +6253,79 @@ InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, return TTI::TCC_Free; } -int X86TTIImpl::getGatherOverhead() const { +int X86TTIImpl::getGatherOverhead(Type *SrcVTy) const { // Some CPUs have more overhead for gather. The specified overhead is relative // to the Load operation. "2" is the number provided by Intel architects. This // parameter is used for cost estimation of Gather Op and comparison with // other alternatives. // TODO: Remove the explicit hasAVX512()?, That would mean we would only // enable gather with a -march. + + // AMD znver4+ targets enable per-shape costs measured on the hardware via + // TuningPreferAMDZenGSCost (set in ZN4Tuning). Pre-AVX-512 Zen parts + // (znver1..3) take the scalarise path for masked gather and never reach + // this code, so the table only needs to cover AVX-512 widths. + if (ST->hasPreferAMDZenGSCost() && ST->hasAVX512() && SrcVTy) { + // Per-shape gather costs for AMD znver4+ targets. + // + // The numbers are the empirical "break-even" (lower-bound) costs + // measured by sweeping a forced gather cost while compiling a + // controlled gather micro-benchmark and observing the point at which + // the LoopVectorizer still chose the gather lowering over the scalar + // fallback. 
The sweep was run independently for every (data type, + // VF) combination on Zen 4 and re-validated on Zen 5; + // the value tabulated below is the cost at which gather emission + // was the right call for that shape. + // + // i64 entries are intentionally absent: the i64 sweep landed within + // the noise of the generic flat overhead, so those shapes fall + // through to the existing flat cost. + static const CostTblEntry ZenGatherCostTable[] = { + {ISD::LOAD, MVT::v2i32, 20}, {ISD::LOAD, MVT::v4i32, 7}, + {ISD::LOAD, MVT::v8i32, 17}, {ISD::LOAD, MVT::v16i32, 14}, + {ISD::LOAD, MVT::v2f32, 20}, {ISD::LOAD, MVT::v4f32, 7}, + {ISD::LOAD, MVT::v8f32, 17}, {ISD::LOAD, MVT::v16f32, 14}, + {ISD::LOAD, MVT::v2f64, 20}, {ISD::LOAD, MVT::v4f64, 7}, + {ISD::LOAD, MVT::v8f64, 17}, {ISD::LOAD, MVT::v16f64, 14}, + }; + EVT VT = TLI->getValueType(DL, SrcVTy); + if (VT.isSimple()) + if (const auto *E = CostTableLookup(ZenGatherCostTable, ISD::LOAD, + VT.getSimpleVT())) + return E->Cost; + } + if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) return 2; return 1024; } -int X86TTIImpl::getScatterOverhead() const { +int X86TTIImpl::getScatterOverhead(Type *SrcVTy) const { + // AMD znver4+ targets use per-shape scatter costs measured on the hardware + // via TuningPreferAMDZenGSCost (set in ZN4Tuning). Fall through to the + // generic flat overhead for shapes we have not characterised. + if (ST->hasPreferAMDZenGSCost() && ST->hasAVX512() && SrcVTy) { + // Per-shape scatter costs for AMD znver4+ targets, measured with the + // same break-even methodology as the gather table above. i32 / f32 + // and f64 lanes use independent curves because their sweep results + // diverged on Zen hardware. i64 entries and VF=2 entries are + // intentionally absent and fall through to the generic flat overhead. 
+ static const CostTblEntry ZenScatterCostTable[] = { + {ISD::STORE, MVT::v4i32, 12}, {ISD::STORE, MVT::v8i32, 14}, + {ISD::STORE, MVT::v16i32, 6}, + {ISD::STORE, MVT::v4f32, 12}, {ISD::STORE, MVT::v8f32, 14}, + {ISD::STORE, MVT::v16f32, 16}, + {ISD::STORE, MVT::v4f64, 5}, {ISD::STORE, MVT::v8f64, 15}, + {ISD::STORE, MVT::v16f64, 3}, + }; + EVT VT = TLI->getValueType(DL, SrcVTy); + if (VT.isSimple()) + if (const auto *E = CostTableLookup(ZenScatterCostTable, ISD::STORE, + VT.getSimpleVT())) + return E->Cost; + } + if (ST->hasAVX512()) return 2; @@ -6338,8 +6397,9 @@ InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, // The gather / scatter cost is given by Intel architects. It is a rough // number since we are looking at one instruction in a time. - const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead() - : getScatterOverhead(); + const int GSOverhead = (Opcode == Instruction::Load) + ? getGatherOverhead(SrcVTy) + : getScatterOverhead(SrcVTy); return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace, CostKind); } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index ea277bfeab560..ceb6dcc172f94 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -346,8 +346,8 @@ class X86TTIImpl final : public BasicTTIImplBase<X86TTIImpl> { Type *DataTy, const Value *Ptr, Align Alignment, unsigned AddressSpace) const; - int getGatherOverhead() const; - int getScatterOverhead() const; + int getGatherOverhead(Type *SrcVTy) const; + int getScatterOverhead(Type *SrcVTy) const; /// @} }; diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-scatter-amd-zen.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-scatter-amd-zen.ll new file mode 100644 index 0000000000000..1565568d8d010 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/masked-gather-scatter-amd-zen.ll @@ -0,0 +1,553 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_analyze_test_checks.py +; Cost-model coverage for AMD Zen-tuned masked gather/scatter overheads. +; +; ZNVER4 / ZNVER5 enable the per-shape Zen cost tables via +; TuningPreferAMDZenGSCost (set in ZN4Tuning and inherited by ZN5Tuning) and +; have AVX-512, so the new tables are consulted in getGSVectorCost. +; ZNVER3 does NOT carry TuningPreferAMDZenGSCost and lacks both AVX-512 and +; TuningFastGather, so isLegalMaskedGather() returns false and the cost model +; walks the scalarise path (getGSScalarCost). The ZNVER3 numbers below are the +; unchanged scalar fallback cost, included here only to lock in that this +; change does not regress pre-AVX-512 Zen targets. +; SKX is a non-Zen AVX-512 baseline showing the generic flat overhead of 2. +; +; RUN: opt < %s -S -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mcpu=znver4 | FileCheck %s --check-prefix=ZNVER4 +; RUN: opt < %s -S -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mcpu=znver5 | FileCheck %s --check-prefix=ZNVER5 +; RUN: opt < %s -S -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mcpu=znver3 | FileCheck %s --check-prefix=ZNVER3 +; RUN: opt < %s -S -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mcpu=skx | FileCheck %s --check-prefix=SKX + +;------------------------------------------------------------------------------ +; Masked gather - i32 element type +;------------------------------------------------------------------------------ + +define <2 x i32> @gather_v2i32(<2 x ptr> %ptrs, <2 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v2i32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x i32> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x 
i32> %v +; +; ZNVER5-LABEL: 'gather_v2i32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x i32> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %v +; +; ZNVER3-LABEL: 'gather_v2i32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x i32> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %v +; +; SKX-LABEL: 'gather_v2i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %v +; + %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x i32> undef) + ret <2 x i32> %v +} + +define <4 x i32> @gather_v4i32(<4 x ptr> %ptrs, <4 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v4i32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v +; +; ZNVER5-LABEL: 'gather_v4i32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v +; +; ZNVER3-LABEL: 'gather_v4i32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> undef) +; ZNVER3-NEXT: Cost Model: Found 
an estimated cost of 0 for instruction: ret <4 x i32> %v +; +; SKX-LABEL: 'gather_v4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v +; + %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> undef) + ret <4 x i32> %v +} + +define <8 x i32> @gather_v8i32(<8 x ptr> %ptrs, <8 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v8i32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x i32> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %v +; +; ZNVER5-LABEL: 'gather_v8i32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x i32> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %v +; +; ZNVER3-LABEL: 'gather_v8i32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x i32> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %v +; +; SKX-LABEL: 'gather_v8i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %v +; + %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %mask, <8 x i32> undef) + ret <8 x i32> %v +} + +define <16 x i32> @gather_v16i32(<16 x ptr> %ptrs, <16 x i1> %mask) { +; 
ZNVER4-LABEL: 'gather_v16i32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x i32> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %v +; +; ZNVER5-LABEL: 'gather_v16i32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x i32> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %v +; +; ZNVER3-LABEL: 'gather_v16i32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %v = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x i32> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %v +; +; SKX-LABEL: 'gather_v16i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %v +; + %v = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> undef) + ret <16 x i32> %v +} + +;------------------------------------------------------------------------------ +; Masked gather - i64 element type +;------------------------------------------------------------------------------ + +define <2 x i64> @gather_v2i64(<2 x ptr> %ptrs, <2 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v2i64' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x i64> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v +; +; ZNVER5-LABEL: 
'gather_v2i64' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x i64> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v +; +; ZNVER3-LABEL: 'gather_v2i64' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x i64> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v +; +; SKX-LABEL: 'gather_v2i64' +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v +; + %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x i64> undef) + ret <2 x i64> %v +} + +define <4 x i64> @gather_v4i64(<4 x ptr> %ptrs, <4 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v4i64' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x i64> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %v +; +; ZNVER5-LABEL: 'gather_v4i64' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x i64> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %v +; +; ZNVER3-LABEL: 'gather_v4i64' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x i64> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret <4 x i64> %v +; +; SKX-LABEL: 'gather_v4i64' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %v +; + %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x i64> undef) + ret <4 x i64> %v +} + +define <8 x i64> @gather_v8i64(<8 x ptr> %ptrs, <8 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v8i64' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x i64> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %v +; +; ZNVER5-LABEL: 'gather_v8i64' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x i64> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %v +; +; ZNVER3-LABEL: 'gather_v8i64' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x i64> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %v +; +; SKX-LABEL: 'gather_v8i64' +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %v +; + %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x i64> undef) + ret <8 x i64> %v +} + +;------------------------------------------------------------------------------ +; Masked gather - 
f32 element type +;------------------------------------------------------------------------------ + +define <2 x float> @gather_v2f32(<2 x ptr> %ptrs, <2 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v2f32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x float> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %v +; +; ZNVER5-LABEL: 'gather_v2f32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x float> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %v +; +; ZNVER3-LABEL: 'gather_v2f32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x float> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %v +; +; SKX-LABEL: 'gather_v2f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %v +; + %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x float> undef) + ret <2 x float> %v +} + +define <4 x float> @gather_v4f32(<4 x ptr> %ptrs, <4 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v4f32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x float> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %v +; +; ZNVER5-LABEL: 'gather_v4f32' +; ZNVER5-NEXT: Cost 
Model: Found an estimated cost of 11 for instruction: %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x float> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %v +; +; ZNVER3-LABEL: 'gather_v4f32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x float> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %v +; +; SKX-LABEL: 'gather_v4f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %v +; + %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float> %v +} + +define <8 x float> @gather_v8f32(<8 x ptr> %ptrs, <8 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v8f32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x float> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %v +; +; ZNVER5-LABEL: 'gather_v8f32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x float> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %v +; +; ZNVER3-LABEL: 'gather_v8f32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x float> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost 
of 0 for instruction: ret <8 x float> %v +; +; SKX-LABEL: 'gather_v8f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %v +; + %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %mask, <8 x float> undef) + ret <8 x float> %v +} + +define <16 x float> @gather_v16f32(<16 x ptr> %ptrs, <16 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v16f32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x float> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %v +; +; ZNVER5-LABEL: 'gather_v16f32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x float> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %v +; +; ZNVER3-LABEL: 'gather_v16f32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x float> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %v +; +; SKX-LABEL: 'gather_v16f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %v +; + %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float> %v +} + 
+;------------------------------------------------------------------------------ +; Masked gather - f64 element type +;------------------------------------------------------------------------------ + +define <2 x double> @gather_v2f64(<2 x ptr> %ptrs, <2 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v2f64' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x double> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %v +; +; ZNVER5-LABEL: 'gather_v2f64' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x double> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %v +; +; ZNVER3-LABEL: 'gather_v2f64' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x double> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %v +; +; SKX-LABEL: 'gather_v2f64' +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %v +; + %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x double> undef) + ret <2 x double> %v +} + +define <4 x double> @gather_v4f64(<4 x ptr> %ptrs, <4 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v4f64' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x double> undef) +; ZNVER4-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret <4 x double> %v +; +; ZNVER5-LABEL: 'gather_v4f64' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x double> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %v +; +; ZNVER3-LABEL: 'gather_v4f64' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x double> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %v +; +; SKX-LABEL: 'gather_v4f64' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %v +; + %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x double> undef) + ret <4 x double> %v +} + +define <8 x double> @gather_v8f64(<8 x ptr> %ptrs, <8 x i1> %mask) { +; ZNVER4-LABEL: 'gather_v8f64' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x double> undef) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %v +; +; ZNVER5-LABEL: 'gather_v8f64' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x double> undef) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %v +; +; ZNVER3-LABEL: 'gather_v8f64' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x double> 
@llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x double> undef) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %v +; +; SKX-LABEL: 'gather_v8f64' +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %v +; + %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x double> undef) + ret <8 x double> %v +} + +;------------------------------------------------------------------------------ +; Masked scatter - i32 element type +;------------------------------------------------------------------------------ + +define void @scatter_v4i32(<4 x i32> %src, <4 x ptr> %ptrs, <4 x i1> %mask) { +; ZNVER4-LABEL: 'scatter_v4i32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER5-LABEL: 'scatter_v4i32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER3-LABEL: 'scatter_v4i32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'scatter_v4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %src, <4 x ptr> align 4 %ptrs, <4 x i1> 
%mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %src, <4 x ptr> %ptrs, i32 4, <4 x i1> %mask) + ret void +} + +define void @scatter_v8i32(<8 x i32> %src, <8 x ptr> %ptrs, <8 x i1> %mask) { +; ZNVER4-LABEL: 'scatter_v8i32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER5-LABEL: 'scatter_v8i32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER3-LABEL: 'scatter_v8i32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'scatter_v8i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %src, <8 x ptr> %ptrs, i32 4, <8 x i1> %mask) + ret void +} + +define void @scatter_v16i32(<16 x i32> %src, <16 x ptr> %ptrs, <16 x i1> %mask) { +; ZNVER4-LABEL: 'scatter_v16i32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER5-LABEL: 'scatter_v16i32' +; ZNVER5-NEXT: 
Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER3-LABEL: 'scatter_v16i32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'scatter_v16i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask) + ret void +} + +;------------------------------------------------------------------------------ +; Masked scatter - i64 element type +;------------------------------------------------------------------------------ + +define void @scatter_v4i64(<4 x i64> %src, <4 x ptr> %ptrs, <4 x i1> %mask) { +; ZNVER4-LABEL: 'scatter_v4i64' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER5-LABEL: 'scatter_v4i64' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER3-LABEL: 'scatter_v4i64' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %src, <4 x ptr> 
align 8 %ptrs, <4 x i1> %mask) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'scatter_v4i64' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %src, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask) + ret void +} + +define void @scatter_v8i64(<8 x i64> %src, <8 x ptr> %ptrs, <8 x i1> %mask) { +; ZNVER4-LABEL: 'scatter_v8i64' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER5-LABEL: 'scatter_v8i64' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER3-LABEL: 'scatter_v8i64' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'scatter_v8i64' +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %src, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) + ret void +} + +;------------------------------------------------------------------------------ +; Masked scatter - f32 element type 
+;------------------------------------------------------------------------------ + +define void @scatter_v4f32(<4 x float> %src, <4 x ptr> %ptrs, <4 x i1> %mask) { +; ZNVER4-LABEL: 'scatter_v4f32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER5-LABEL: 'scatter_v4f32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER3-LABEL: 'scatter_v4f32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'scatter_v4f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %src, <4 x ptr> %ptrs, i32 4, <4 x i1> %mask) + ret void +} + +define void @scatter_v8f32(<8 x float> %src, <8 x ptr> %ptrs, <8 x i1> %mask) { +; ZNVER4-LABEL: 'scatter_v8f32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER5-LABEL: 'scatter_v8f32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %src, <8 
x ptr> align 4 %ptrs, <8 x i1> %mask) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER3-LABEL: 'scatter_v8f32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'scatter_v8f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %src, <8 x ptr> %ptrs, i32 4, <8 x i1> %mask) + ret void +} + +define void @scatter_v16f32(<16 x float> %src, <16 x ptr> %ptrs, <16 x i1> %mask) { +; ZNVER4-LABEL: 'scatter_v16f32' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER5-LABEL: 'scatter_v16f32' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER3-LABEL: 'scatter_v16f32' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'scatter_v16f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src, <16 x ptr> align 4 
%ptrs, <16 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask) + ret void +} + +;------------------------------------------------------------------------------ +; Masked scatter - f64 element type +;------------------------------------------------------------------------------ + +define void @scatter_v4f64(<4 x double> %src, <4 x ptr> %ptrs, <4 x i1> %mask) { +; ZNVER4-LABEL: 'scatter_v4f64' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER5-LABEL: 'scatter_v4f64' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER3-LABEL: 'scatter_v4f64' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'scatter_v4f64' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %src, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask) + ret void +} + +define void @scatter_v8f64(<8 x double> %src, <8 x ptr> %ptrs, <8 x i1> %mask) { +; ZNVER4-LABEL: 'scatter_v8f64' +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call 
void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask) +; ZNVER4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER5-LABEL: 'scatter_v8f64' +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask) +; ZNVER5-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZNVER3-LABEL: 'scatter_v8f64' +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask) +; ZNVER3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'scatter_v8f64' +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %src, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) + ret void +}