diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1d2d49b287d43..bb15e962e73d0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2124,13 +2124,20 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
       { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
       { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
       // PermuteSingleSrc shuffle kinds.
-      // TODO: handle vXi8/vXi16.
       { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
       { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
       { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
       { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
       { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
       { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
+      { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
+      { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
+      { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
+      { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
+      { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
+      { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
+      { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
+      { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
       // Reverse can be lowered with `rev`.
       { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
       { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
index 07f5b7fa0d54e..defbe964b2afc 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
@@ -12,7 +12,7 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 181 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 362 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
index a8c75a9c92221..0f1bddd7325cd 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
@@ -4,16 +4,16 @@
 define void @reduce_umin() {
 ; CHECK-LABEL: 'reduce_umin'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.umin.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V3i8 = call i8 @llvm.vector.reduce.umin.v3i8(<3 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4i8 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8i8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 608 for instruction: %V16i8 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 760 for instruction: %V32i8 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1064 for instruction: %V64i8 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.umin.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i8 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V8i8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 280 for instruction: %V16i8 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i8 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 736 for instruction: %V64i8 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2i16 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4i16 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8i16 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16i16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i16 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V8i16 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %V16i16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2i32 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
@@ -43,16 +43,16 @@ define void @reduce_umin() {
 define void @reduce_umax() {
 ; CHECK-LABEL: 'reduce_umax'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V3i8 = call i8 @llvm.vector.reduce.umax.v3i8(<3 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4i8 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8i8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 608 for instruction: %V16i8 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 760 for instruction: %V32i8 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1064 for instruction: %V64i8 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.umax.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i8 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V8i8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 280 for instruction: %V16i8 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i8 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 736 for instruction: %V64i8 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2i16 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4i16 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8i16 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16i16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i16 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V8i16 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %V16i16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2i32 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
@@ -82,16 +82,16 @@ define void @reduce_umax() {
 define void @reduce_smin() {
 ; CHECK-LABEL: 'reduce_smin'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.smin.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V3i8 = call i8 @llvm.vector.reduce.smin.v3i8(<3 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4i8 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8i8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 608 for instruction: %V16i8 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 760 for instruction: %V32i8 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1064 for instruction: %V64i8 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.smin.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i8 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V8i8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 280 for instruction: %V16i8 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i8 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 736 for instruction: %V64i8 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2i16 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4i16 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8i16 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16i16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i16 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V8i16 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %V16i16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2i32 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
@@ -121,16 +121,16 @@ define void @reduce_smin() {
 define void @reduce_smax() {
 ; CHECK-LABEL: 'reduce_smax'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.smax.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V3i8 = call i8 @llvm.vector.reduce.smax.v3i8(<3 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4i8 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8i8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 608 for instruction: %V16i8 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 760 for instruction: %V32i8 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1064 for instruction: %V64i8 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.smax.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i8 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V8i8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 280 for instruction: %V16i8 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V32i8 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 736 for instruction: %V64i8 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2i16 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4i16 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8i16 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16i16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i16 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V8i16 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %V16i16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2i32 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4i32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8i32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
@@ -159,10 +159,10 @@ define void @reduce_smax() {
 
 define void @reduce_fmin() {
 ; CHECK-LABEL: 'reduce_fmin'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 300 for instruction: %V8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 400 for instruction: %V16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %V16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
@@ -184,10 +184,10 @@ define void @reduce_fmin() {
 
 define void @reduce_fmax() {
 ; CHECK-LABEL: 'reduce_fmax'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 300 for instruction: %V8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 400 for instruction: %V16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %V16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2f32 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
index 1492fa4364bc6..e30b80414c691 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
@@ -12,7 +12,7 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 181 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 362 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
index ad8fdb5e0a736..bc5033388dffa 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
@@ -5,14 +5,14 @@ define void @reduce() {
 ; CHECK-LABEL: 'reduce'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 364 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 455 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 637 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1001 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 309 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 673 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
index 646406ed5114e..854ee4fdfcc2d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -4,12 +4,12 @@
 define void @shuffle() {
 ; CHECK-LABEL: 'shuffle'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll b/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll
index 4aef33c632dd4..ce743526ee565 100644
--- a/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll
@@ -48,7 +48,7 @@ define i32 @add.i32.v4i32(<4 x i32> %v) {
 
 define i8 @umin.i8.v8i8(<8 x i8> %v) {
 ; COST-LABEL: 'umin.i8.v8i8'
-; COST-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
 ;
   %r = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v)
@@ -57,7 +57,7 @@ define i8 @umin.i8.v8i8(<8 x i8> %v) {
 
 define i8 @umin.i8.v16i8(<16 x i8> %v) {
 ; COST-LABEL: 'umin.i8.v16i8'
-; COST-NEXT:  Cost Model: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 280 for instruction: %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
 ;
   %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v)
@@ -66,7 +66,7 @@ define i8 @umin.i8.v16i8(<16 x i8> %v) {
 
 define i16 @umin.i16.v4i16(<4 x i16> %v) {
 ; COST-LABEL: 'umin.i16.v4i16'
-; COST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %r = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %r = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v)
@@ -75,7 +75,7 @@ define i16 @umin.i16.v4i16(<4 x i16> %v) {
 
 define i16 @umin.i16.v8i16(<8 x i16> %v) {
 ; COST-LABEL: 'umin.i16.v8i16'
-; COST-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v)
@@ -93,7 +93,7 @@ define i32 @umin.i32.v4i32(<4 x i32> %v) {
 
 define i8 @umax.i8.v8i8(<8 x i8> %v) {
 ; COST-LABEL: 'umax.i8.v8i8'
-; COST-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
 ;
   %r = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
@@ -102,7 +102,7 @@ define i8 @umax.i8.v8i8(<8 x i8> %v) {
 
 define i8 @umax.i8.v16i8(<16 x i8> %v) {
 ; COST-LABEL: 'umax.i8.v16i8'
-; COST-NEXT:  Cost Model: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 280 for instruction: %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
 ;
   %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
@@ -111,7 +111,7 @@ define i8 @umax.i8.v16i8(<16 x i8> %v) {
 
 define i16 @umax.i16.v4i16(<4 x i16> %v) {
 ; COST-LABEL: 'umax.i16.v4i16'
-; COST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %r = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %r = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v)
@@ -120,7 +120,7 @@ define i16 @umax.i16.v4i16(<4 x i16> %v) {
 
 define i16 @umax.i16.v8i16(<8 x i16> %v) {
 ; COST-LABEL: 'umax.i16.v8i16'
-; COST-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v)
@@ -138,7 +138,7 @@ define i32 @umax.i32.v4i32(<4 x i32> %v) {
 
 define i8 @smin.i8.v8i8(<8 x i8> %v) {
 ; COST-LABEL: 'smin.i8.v8i8'
-; COST-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
 ;
   %r = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
@@ -147,7 +147,7 @@ define i8 @smin.i8.v8i8(<8 x i8> %v) {
 
 define i8 @smin.i8.v16i8(<16 x i8> %v) {
 ; COST-LABEL: 'smin.i8.v16i8'
-; COST-NEXT:  Cost Model: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 280 for instruction: %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
 ;
   %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
@@ -156,7 +156,7 @@ define i8 @smin.i8.v16i8(<16 x i8> %v) {
 
 define i16 @smin.i16.v4i16(<4 x i16> %v) {
 ; COST-LABEL: 'smin.i16.v4i16'
-; COST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %r = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %r = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v)
@@ -165,7 +165,7 @@ define i16 @smin.i16.v4i16(<4 x i16> %v) {
 
 define i16 @smin.i16.v8i16(<8 x i16> %v) {
 ; COST-LABEL: 'smin.i16.v8i16'
-; COST-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v)
@@ -183,7 +183,7 @@ define i32 @smin.i32.v4i32(<4 x i32> %v) {
 
 define i8 @smax.i8.v8i8(<8 x i8> %v) {
 ; COST-LABEL: 'smax.i8.v8i8'
-; COST-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
 ;
   %r = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
@@ -192,7 +192,7 @@ define i8 @smax.i8.v8i8(<8 x i8> %v) {
 
 define i8 @smax.i8.v16i8(<16 x i8> %v) {
 ; COST-LABEL: 'smax.i8.v16i8'
-; COST-NEXT:  Cost Model: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 280 for instruction: %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
 ;
   %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
@@ -201,7 +201,7 @@ define i8 @smax.i8.v16i8(<16 x i8> %v) {
 
 define i16 @smax.i16.v4i16(<4 x i16> %v) {
 ; COST-LABEL: 'smax.i16.v4i16'
-; COST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %r = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %r = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v)
@@ -210,7 +210,7 @@ define i16 @smax.i16.v4i16(<4 x i16> %v) {
 
 define i16 @smax.i16.v8i16(<8 x i16> %v) {
 ; COST-LABEL: 'smax.i16.v8i16'
-; COST-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v)
+; COST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v)
 ; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v)