diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index fffd696e59baf..df5e17a27d26c 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -220,6 +220,11 @@ Makes programs 10x faster by doing Special New Thing.
 * `.att_syntax` directive is now emitted for assembly files when AT&T syntax is
   in use. This matches the behaviour of Intel syntax and aids with
   compatibility when changing the default Clang syntax to the Intel syntax.
+* Masked gather and scatter cost overheads are now per-shape on AMD znver4
+  and znver5 targets via a new `TuningPreferAMDZenGSCost` subtarget
+  feature, replacing the single flat overhead inherited from the generic
+  AVX-512 path. The per-shape costs use empirical break-even values
+  measured on Zen 4 / Zen 5 hardware.
 
 ### Changes to the OCaml bindings
 
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 50fb7204ebfa1..28bbd639649bb 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -721,6 +721,17 @@ def TuningFastGather
     : SubtargetFeature<"fast-gather", "HasFastGather", "true",
                        "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
 
+// Use AMD Zen-tuned cost tables for masked gather/scatter intrinsics in the
+// X86 TargetTransformInfo cost model. Refines the flat overhead used by other
+// AVX-512 targets with per-element-type/per-VL costs measured on znver4 and
+// znver5. Inherited automatically by every znver4+ CPU via ZN4Tuning; not
+// applied to pre-AVX-512 Zen parts (znver1..3), which take the scalarise
+// path for masked gather anyway.
+def TuningPreferAMDZenGSCost
+    : SubtargetFeature<"prefer-amd-zen-gs-cost",
+                       "HasPreferAMDZenGSCost", "true",
+                       "Use AMD Zen-tuned gather/scatter cost tables in the cost model">;
+
 // Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
 def TuningFastDPWSSD
     : SubtargetFeature<
@@ -1631,7 +1642,8 @@ def ProcessorFeatures {
   list<SubtargetFeature> ZN3Features =
     !listconcat(ZN2Features, ZN3AdditionalFeatures);
 
-  list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD];
+  list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD,
+                                                TuningPreferAMDZenGSCost];
   list<SubtargetFeature> ZN4Tuning =
     !listconcat(ZN3Tuning, ZN4AdditionalTuning);
   list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 698be1615a04b..edc8e78c7f040 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6253,20 +6253,79 @@ InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
   return TTI::TCC_Free;
 }
 
-int X86TTIImpl::getGatherOverhead() const {
+int X86TTIImpl::getGatherOverhead(Type *SrcVTy) const {
   // Some CPUs have more overhead for gather. The specified overhead is relative
   // to the Load operation. "2" is the number provided by Intel architects. This
   // parameter is used for cost estimation of Gather Op and comparison with
   // other alternatives.
   // TODO: Remove the explicit hasAVX512()?, That would mean we would only
   // enable gather with a -march.
+
+  // AMD znver4+ targets enable per-shape costs measured on the hardware via
+  // TuningPreferAMDZenGSCost (set in ZN4Tuning). Pre-AVX-512 Zen parts
+  // (znver1..3) take the scalarise path for masked gather and never reach
+  // this code, so the table only needs to cover AVX-512 widths.
+  if (ST->hasPreferAMDZenGSCost() && SrcVTy) {
+    // Per-shape gather costs for AMD znver4+ targets.
+    //
+    // The numbers are the empirical "break-even" (lower-bound) costs
+    // measured by sweeping a forced gather cost while compiling a
+    // controlled gather micro-benchmark and observing the point at which
+    // the LoopVectorizer still chose the gather lowering over the scalar
+    // fallback. The sweep was run independently for every (data type,
+    // VF) combination on Genoa / Milan / Turin and re-validated on Zen 5;
+    // the value tabulated below is the cost at which gather emission
+    // was the right call for that shape.
+    //
+    // i64 entries are intentionally absent: the i64 sweep landed within
+    // the noise of the generic flat overhead, so those shapes fall
+    // through to the existing flat cost.
+    static const CostTblEntry ZenGatherCostTable[] = {
+        {ISD::LOAD, MVT::v2i32, 20}, {ISD::LOAD, MVT::v4i32,  7},
+        {ISD::LOAD, MVT::v8i32, 17}, {ISD::LOAD, MVT::v16i32, 14},
+        {ISD::LOAD, MVT::v2f32, 20}, {ISD::LOAD, MVT::v4f32,  7},
+        {ISD::LOAD, MVT::v8f32, 17}, {ISD::LOAD, MVT::v16f32, 14},
+        {ISD::LOAD, MVT::v2f64, 20}, {ISD::LOAD, MVT::v4f64,  7},
+        {ISD::LOAD, MVT::v8f64, 17}, {ISD::LOAD, MVT::v16f64, 14},
+    };
+    EVT VT = TLI->getValueType(DL, SrcVTy);
+    if (VT.isSimple())
+      if (const auto *E = CostTableLookup(ZenGatherCostTable, ISD::LOAD,
+                                          VT.getSimpleVT()))
+        return E->Cost;
+  }
+
   if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
     return 2;
 
   return 1024;
 }
 
-int X86TTIImpl::getScatterOverhead() const {
+int X86TTIImpl::getScatterOverhead(Type *SrcVTy) const {
+  // AMD znver4+ targets use per-shape scatter costs measured on the hardware
+  // via TuningPreferAMDZenGSCost (set in ZN4Tuning). Fall through to the
+  // generic flat overhead for shapes we have not characterised.
+  if (ST->hasPreferAMDZenGSCost() && ST->hasAVX512() && SrcVTy) {
+    // Per-shape scatter costs for AMD znver4+ targets, measured with the
+    // same break-even methodology as the gather table above. i32 / f32
+    // and f64 lanes use independent curves because their sweep results
+    // diverged on Zen hardware. i64 entries and VF=2 entries are
+    // intentionally absent and fall through to the generic flat overhead.
+    static const CostTblEntry ZenScatterCostTable[] = {
+        {ISD::STORE, MVT::v4i32, 12}, {ISD::STORE, MVT::v8i32, 14},
+        {ISD::STORE, MVT::v16i32, 6},
+        {ISD::STORE, MVT::v4f32, 12}, {ISD::STORE, MVT::v8f32, 14},
+        {ISD::STORE, MVT::v16f32, 16},
+        {ISD::STORE, MVT::v4f64,  5}, {ISD::STORE, MVT::v8f64, 15},
+        {ISD::STORE, MVT::v16f64, 3},
+    };
+    EVT VT = TLI->getValueType(DL, SrcVTy);
+    if (VT.isSimple())
+      if (const auto *E = CostTableLookup(ZenScatterCostTable, ISD::STORE,
+                                          VT.getSimpleVT()))
+        return E->Cost;
+  }
+
   if (ST->hasAVX512())
     return 2;
 
@@ -6338,8 +6397,9 @@ InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
 
   // The gather / scatter cost is given by Intel architects. It is a rough
   // number since we are looking at one instruction in a time.
-  const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
-                                                       : getScatterOverhead();
+  const int GSOverhead = (Opcode == Instruction::Load)
+                             ? getGatherOverhead(SrcVTy)
+                             : getScatterOverhead(SrcVTy);
   return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                            Alignment, AddressSpace, CostKind);
 }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index ea277bfeab560..ceb6dcc172f94 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -346,8 +346,8 @@ class X86TTIImpl final : public BasicTTIImplBase<X86TTIImpl> {
                                   Type *DataTy, const Value *Ptr,
                                   Align Alignment, unsigned AddressSpace) const;
 
-  int getGatherOverhead() const;
-  int getScatterOverhead() const;
+  int getGatherOverhead(Type *SrcVTy) const;
+  int getScatterOverhead(Type *SrcVTy) const;
 
   /// @}
 };
diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-scatter-amd-zen.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-scatter-amd-zen.ll
new file mode 100644
index 0000000000000..1565568d8d010
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/masked-gather-scatter-amd-zen.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; Cost-model coverage for AMD Zen-tuned masked gather/scatter overheads.
+;
+; ZNVER4 / ZNVER5 enable the per-shape Zen cost tables via
+; TuningPreferAMDZenGSCost (set in ZN4Tuning and inherited by ZN5Tuning) and
+; have AVX-512, so the new tables are consulted in getGSVectorCost.
+; ZNVER3 does NOT carry TuningPreferAMDZenGSCost and lacks both AVX-512 and
+; TuningFastGather, so isLegalMaskedGather() returns false and the cost model
+; walks the scalarise path (getGSScalarCost). The ZNVER3 numbers below are the
+; unchanged scalar fallback cost, included here only to lock in that this
+; change does not regress pre-AVX-512 Zen targets.
+; SKX is a non-Zen AVX-512 baseline showing the generic flat overhead of 2.
+;
+; RUN: opt < %s -S -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mcpu=znver4 | FileCheck %s --check-prefix=ZNVER4
+; RUN: opt < %s -S -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mcpu=znver5 | FileCheck %s --check-prefix=ZNVER5
+; RUN: opt < %s -S -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mcpu=znver3 | FileCheck %s --check-prefix=ZNVER3
+; RUN: opt < %s -S -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mcpu=skx    | FileCheck %s --check-prefix=SKX
+
+;------------------------------------------------------------------------------
+; Masked gather - i32 element type
+;------------------------------------------------------------------------------
+
+define <2 x i32> @gather_v2i32(<2 x ptr> %ptrs, <2 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v2i32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x i32> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %v
+;
+; ZNVER5-LABEL: 'gather_v2i32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x i32> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %v
+;
+; ZNVER3-LABEL: 'gather_v2i32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x i32> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %v
+;
+; SKX-LABEL: 'gather_v2i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %v
+;
+  %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x i32> undef)
+  ret <2 x i32> %v
+}
+
+define <4 x i32> @gather_v4i32(<4 x ptr> %ptrs, <4 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v4i32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v
+;
+; ZNVER5-LABEL: 'gather_v4i32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v
+;
+; ZNVER3-LABEL: 'gather_v4i32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v
+;
+; SKX-LABEL: 'gather_v4i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v
+;
+  %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> undef)
+  ret <4 x i32> %v
+}
+
+define <8 x i32> @gather_v8i32(<8 x ptr> %ptrs, <8 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v8i32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x i32> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %v
+;
+; ZNVER5-LABEL: 'gather_v8i32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x i32> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %v
+;
+; ZNVER3-LABEL: 'gather_v8i32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x i32> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %v
+;
+; SKX-LABEL: 'gather_v8i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %v
+;
+  %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %mask, <8 x i32> undef)
+  ret <8 x i32> %v
+}
+
+define <16 x i32> @gather_v16i32(<16 x ptr> %ptrs, <16 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v16i32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x i32> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %v
+;
+; ZNVER5-LABEL: 'gather_v16i32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x i32> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %v
+;
+; ZNVER3-LABEL: 'gather_v16i32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %v = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x i32> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %v
+;
+; SKX-LABEL: 'gather_v16i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %v
+;
+  %v = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> undef)
+  ret <16 x i32> %v
+}
+
+;------------------------------------------------------------------------------
+; Masked gather - i64 element type
+;------------------------------------------------------------------------------
+
+define <2 x i64> @gather_v2i64(<2 x ptr> %ptrs, <2 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v2i64'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x i64> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v
+;
+; ZNVER5-LABEL: 'gather_v2i64'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x i64> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v
+;
+; ZNVER3-LABEL: 'gather_v2i64'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x i64> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v
+;
+; SKX-LABEL: 'gather_v2i64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v
+;
+  %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x i64> undef)
+  ret <2 x i64> %v
+}
+
+define <4 x i64> @gather_v4i64(<4 x ptr> %ptrs, <4 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v4i64'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x i64> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %v
+;
+; ZNVER5-LABEL: 'gather_v4i64'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x i64> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %v
+;
+; ZNVER3-LABEL: 'gather_v4i64'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x i64> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %v
+;
+; SKX-LABEL: 'gather_v4i64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %v
+;
+  %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x i64> undef)
+  ret <4 x i64> %v
+}
+
+define <8 x i64> @gather_v8i64(<8 x ptr> %ptrs, <8 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v8i64'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x i64> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %v
+;
+; ZNVER5-LABEL: 'gather_v8i64'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x i64> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %v
+;
+; ZNVER3-LABEL: 'gather_v8i64'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x i64> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %v
+;
+; SKX-LABEL: 'gather_v8i64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %v
+;
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x i64> undef)
+  ret <8 x i64> %v
+}
+
+;------------------------------------------------------------------------------
+; Masked gather - f32 element type
+;------------------------------------------------------------------------------
+
+define <2 x float> @gather_v2f32(<2 x ptr> %ptrs, <2 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v2f32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x float> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %v
+;
+; ZNVER5-LABEL: 'gather_v2f32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x float> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %v
+;
+; ZNVER3-LABEL: 'gather_v2f32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x float> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %v
+;
+; SKX-LABEL: 'gather_v2f32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %v
+;
+  %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x float> undef)
+  ret <2 x float> %v
+}
+
+define <4 x float> @gather_v4f32(<4 x ptr> %ptrs, <4 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v4f32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x float> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %v
+;
+; ZNVER5-LABEL: 'gather_v4f32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x float> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %v
+;
+; ZNVER3-LABEL: 'gather_v4f32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x float> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %v
+;
+; SKX-LABEL: 'gather_v4f32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %v
+;
+  %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x float> undef)
+  ret <4 x float> %v
+}
+
+define <8 x float> @gather_v8f32(<8 x ptr> %ptrs, <8 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v8f32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x float> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %v
+;
+; ZNVER5-LABEL: 'gather_v8f32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x float> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %v
+;
+; ZNVER3-LABEL: 'gather_v8f32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x float> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %v
+;
+; SKX-LABEL: 'gather_v8f32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 4 %ptrs, <8 x i1> %mask, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %v
+;
+  %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %mask, <8 x float> undef)
+  ret <8 x float> %v
+}
+
+define <16 x float> @gather_v16f32(<16 x ptr> %ptrs, <16 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v16f32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x float> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %v
+;
+; ZNVER5-LABEL: 'gather_v16f32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x float> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %v
+;
+; ZNVER3-LABEL: 'gather_v16f32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x float> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %v
+;
+; SKX-LABEL: 'gather_v16f32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %v
+;
+  %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> undef)
+  ret <16 x float> %v
+}
+
+;------------------------------------------------------------------------------
+; Masked gather - f64 element type
+;------------------------------------------------------------------------------
+
+define <2 x double> @gather_v2f64(<2 x ptr> %ptrs, <2 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v2f64'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x double> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %v
+;
+; ZNVER5-LABEL: 'gather_v2f64'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x double> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %v
+;
+; ZNVER3-LABEL: 'gather_v2f64'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x double> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %v
+;
+; SKX-LABEL: 'gather_v2f64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 %ptrs, <2 x i1> %mask, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %v
+;
+  %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x double> undef)
+  ret <2 x double> %v
+}
+
+define <4 x double> @gather_v4f64(<4 x ptr> %ptrs, <4 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v4f64'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x double> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %v
+;
+; ZNVER5-LABEL: 'gather_v4f64'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x double> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %v
+;
+; ZNVER3-LABEL: 'gather_v4f64'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x double> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %v
+;
+; SKX-LABEL: 'gather_v4f64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 %ptrs, <4 x i1> %mask, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %v
+;
+  %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x double> undef)
+  ret <4 x double> %v
+}
+
+define <8 x double> @gather_v8f64(<8 x ptr> %ptrs, <8 x i1> %mask) {
+; ZNVER4-LABEL: 'gather_v8f64'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x double> undef)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %v
+;
+; ZNVER5-LABEL: 'gather_v8f64'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x double> undef)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %v
+;
+; ZNVER3-LABEL: 'gather_v8f64'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x double> undef)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %v
+;
+; SKX-LABEL: 'gather_v8f64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 8 %ptrs, <8 x i1> %mask, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %v
+;
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x double> undef)
+  ret <8 x double> %v
+}
+
+;------------------------------------------------------------------------------
+; Masked scatter - i32 element type
+;------------------------------------------------------------------------------
+
+define void @scatter_v4i32(<4 x i32> %src, <4 x ptr> %ptrs, <4 x i1> %mask) {
+; ZNVER4-LABEL: 'scatter_v4i32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER5-LABEL: 'scatter_v4i32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER3-LABEL: 'scatter_v4i32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'scatter_v4i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %src, <4 x ptr> %ptrs, i32 4, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_v8i32(<8 x i32> %src, <8 x ptr> %ptrs, <8 x i1> %mask) {
+; ZNVER4-LABEL: 'scatter_v8i32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER5-LABEL: 'scatter_v8i32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER3-LABEL: 'scatter_v8i32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'scatter_v8i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %src, <8 x ptr> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+
+define void @scatter_v16i32(<16 x i32> %src, <16 x ptr> %ptrs, <16 x i1> %mask) {
+; ZNVER4-LABEL: 'scatter_v16i32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER5-LABEL: 'scatter_v16i32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER3-LABEL: 'scatter_v16i32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'scatter_v16i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask)
+  ret void
+}
+
+;------------------------------------------------------------------------------
+; Masked scatter - i64 element type
+;------------------------------------------------------------------------------
+
+define void @scatter_v4i64(<4 x i64> %src, <4 x ptr> %ptrs, <4 x i1> %mask) {
+; ZNVER4-LABEL: 'scatter_v4i64'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER5-LABEL: 'scatter_v4i64'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER3-LABEL: 'scatter_v4i64'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'scatter_v4i64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %src, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_v8i64(<8 x i64> %src, <8 x ptr> %ptrs, <8 x i1> %mask) {
+; ZNVER4-LABEL: 'scatter_v8i64'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER5-LABEL: 'scatter_v8i64'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER3-LABEL: 'scatter_v8i64'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'scatter_v8i64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %src, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
+  ret void
+}
+
+;------------------------------------------------------------------------------
+; Masked scatter - f32 element type
+;------------------------------------------------------------------------------
+
+define void @scatter_v4f32(<4 x float> %src, <4 x ptr> %ptrs, <4 x i1> %mask) {
+; ZNVER4-LABEL: 'scatter_v4f32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER5-LABEL: 'scatter_v4f32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER3-LABEL: 'scatter_v4f32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'scatter_v4f32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %src, <4 x ptr> align 4 %ptrs, <4 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %src, <4 x ptr> %ptrs, i32 4, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_v8f32(<8 x float> %src, <8 x ptr> %ptrs, <8 x i1> %mask) {
+; ZNVER4-LABEL: 'scatter_v8f32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER5-LABEL: 'scatter_v8f32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER3-LABEL: 'scatter_v8f32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'scatter_v8f32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %src, <8 x ptr> align 4 %ptrs, <8 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %src, <8 x ptr> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+
+define void @scatter_v16f32(<16 x float> %src, <16 x ptr> %ptrs, <16 x i1> %mask) {
+; ZNVER4-LABEL: 'scatter_v16f32'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER5-LABEL: 'scatter_v16f32'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER3-LABEL: 'scatter_v16f32'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'scatter_v16f32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src, <16 x ptr> align 4 %ptrs, <16 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask)
+  ret void
+}
+
+;------------------------------------------------------------------------------
+; Masked scatter - f64 element type
+;------------------------------------------------------------------------------
+
+define void @scatter_v4f64(<4 x double> %src, <4 x ptr> %ptrs, <4 x i1> %mask) {
+; ZNVER4-LABEL: 'scatter_v4f64'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER5-LABEL: 'scatter_v4f64'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER3-LABEL: 'scatter_v4f64'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'scatter_v4f64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %src, <4 x ptr> align 8 %ptrs, <4 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %src, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_v8f64(<8 x double> %src, <8 x ptr> %ptrs, <8 x i1> %mask) {
+; ZNVER4-LABEL: 'scatter_v8f64'
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask)
+; ZNVER4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER5-LABEL: 'scatter_v8f64'
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask)
+; ZNVER5-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZNVER3-LABEL: 'scatter_v8f64'
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask)
+; ZNVER3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SKX-LABEL: 'scatter_v8f64'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %src, <8 x ptr> align 8 %ptrs, <8 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %src, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
+  ret void
+}