[NFC][X86][Costmodel] Add some more interleaved load/store tests with i16 element type

Not sure if even larger interleaving factors are needed, but these are what I have seen being queried in the wild.
llvm · May 26, 2021 · 78c9796 · 78c9796
1 parent ff08c34
commit 78c9796
Show file tree

Hide file tree

Showing 10 changed files with 294 additions and 24 deletions.
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
@@ -1,18 +1,20 @@
-; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 @A = global [1024 x i16] zeroinitializer, align 128
-@B = global [1024 x i16] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
 
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 17 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 41 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 114 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 228 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i16, i16* %in0, align 2
 
 define void @test() {
 entry:
@@ -32,8 +34,10 @@ for.body:
 
   %reduce.add.0 = add i16 %v0, %v1
 
-  %out = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
-  store i16 %reduce.add.0, i16* %out
+  %reduce.add.0.narrow = trunc i16 %reduce.add.0 to i8
+
+  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.0.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 2
   %cmp = icmp ult i64 %iv.next, 1024

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
@@ -1,18 +1,20 @@
-; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 @A = global [1024 x i16] zeroinitializer, align 128
-@B = global [1024 x i16] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
 
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 15 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 31 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 58 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 342 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i16, i16* %in0, align 2
 
 define void @test() {
 entry:
@@ -36,8 +38,10 @@ for.body:
   %reduce.add.0 = add i16 %v0, %v1
   %reduce.add.1 = add i16 %reduce.add.0, %v2
 
-  %out = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
-  store i16 %reduce.add.1, i16* %out
+  %reduce.add.1.narrow = trunc i16 %reduce.add.1 to i8
+
+  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.1.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 3
   %cmp = icmp ult i64 %iv.next, 1024

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
@@ -1,18 +1,20 @@
-; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 @A = global [1024 x i16] zeroinitializer, align 128
-@B = global [1024 x i16] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
 
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 17 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 41 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 82 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 228 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 456 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i16, i16* %in0, align 2
 
 define void @test() {
 entry:
@@ -40,8 +42,10 @@ for.body:
   %reduce.add.1 = add i16 %reduce.add.0, %v2
   %reduce.add.2 = add i16 %reduce.add.1, %v3
 
-  %out = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
-  store i16 %reduce.add.2, i16* %out
+  %reduce.add.2.narrow = trunc i16 %reduce.add.2 to i8
+
+  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.2.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 4
   %cmp = icmp ult i64 %iv.next, 1024

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -0,0 +1,59 @@
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; The command above feeds this file to opt's loop vectorizer with +avx2 and pipes the loop-vectorize debug output (stderr merged into stdout) to FileCheck.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; @A is the i16 source array read by the loop in @test; @B is the i8 destination array it writes.
+@A = global [1024 x i16] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+; Expected cost lines for the first load of the group (%v0) at VF 1, 2, 4, 8 and 16; the final negative pattern rejects any further cost line for that load.
+; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 50 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 99 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 285 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i16, i16* %in0, align 2
+; Loop that reads 5 consecutive i16 elements of @A per iteration, sums them, and stores the sum truncated to i8 into @B.
+define void @test() {
+entry:
+  br label %for.body
+; Each iteration handles the group @A[%iv] .. @A[%iv + 4]; %iv starts at 0 and advances by 5.
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; Element indices of the 5 group members (%iv.0 is %iv itself).
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+; Addresses of the 5 group members inside @A.
+  %in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.4
+; The 5 loads of the group; this file's cost expectations are written against %v0, so its name and operands must not change.
+  %v0 = load i16, i16* %in0
+  %v1 = load i16, i16* %in1
+  %v2 = load i16, i16* %in2
+  %v3 = load i16, i16* %in3
+  %v4 = load i16, i16* %in4
+; Sum the 5 loaded values in i16.
+  %reduce.add.0 = add i16 %v0, %v1
+  %reduce.add.1 = add i16 %reduce.add.0, %v2
+  %reduce.add.2 = add i16 %reduce.add.1, %v3
+  %reduce.add.3 = add i16 %reduce.add.2, %v4
+; Narrow the sum to i8, the element type of @B.
+  %reduce.add.3.narrow = trunc i16 %reduce.add.3 to i8
+; Store the narrowed sum at @B[%iv].
+  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.3.narrow, i8* %out
+; Step by the group size (5) and keep looping while the next index is below 1024.
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+; NOTE(review): the last iteration entered has %iv = 1020, where %iv.4 = 1024 indexes one element past the end of @A -- confirm this is acceptable for a cost-only test.
+for.cond.cleanup:
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
@@ -0,0 +1,63 @@
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; The command above feeds this file to opt's loop vectorizer with +avx2 and pipes the loop-vectorize debug output (stderr merged into stdout) to FileCheck.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; @A is the i16 source array read by the loop in @test; @B is the i8 destination array it writes.
+@A = global [1024 x i16] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+; Expected cost lines for the first load of the group (%v0) at VF 1, 2, 4, 8 and 16; the final negative pattern rejects any further cost line for that load.
+; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 31 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 58 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 123 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 342 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i16, i16* %in0, align 2
+; Loop that reads 6 consecutive i16 elements of @A per iteration, sums them, and stores the sum truncated to i8 into @B.
+define void @test() {
+entry:
+  br label %for.body
+; Each iteration handles the group @A[%iv] .. @A[%iv + 5]; %iv starts at 0 and advances by 6.
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; Element indices of the 6 group members (%iv.0 is %iv itself).
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+; Addresses of the 6 group members inside @A.
+  %in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.5
+; The 6 loads of the group; this file's cost expectations are written against %v0, so its name and operands must not change.
+  %v0 = load i16, i16* %in0
+  %v1 = load i16, i16* %in1
+  %v2 = load i16, i16* %in2
+  %v3 = load i16, i16* %in3
+  %v4 = load i16, i16* %in4
+  %v5 = load i16, i16* %in5
+; Sum the 6 loaded values in i16.
+  %reduce.add.0 = add i16 %v0, %v1
+  %reduce.add.1 = add i16 %reduce.add.0, %v2
+  %reduce.add.2 = add i16 %reduce.add.1, %v3
+  %reduce.add.3 = add i16 %reduce.add.2, %v4
+  %reduce.add.4 = add i16 %reduce.add.3, %v5
+; Narrow the sum to i8, the element type of @B.
+  %reduce.add.4.narrow = trunc i16 %reduce.add.4 to i8
+; Store the narrowed sum at @B[%iv].
+  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.4.narrow, i8* %out
+; Step by the group size (6) and keep looping while the next index is below 1024.
+  %iv.next = add nuw nsw i64 %iv.0, 6
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+; NOTE(review): the last iteration entered has %iv = 1020, where %iv.4 = 1024 and %iv.5 = 1025 index past the end of @A -- confirm this is acceptable for a cost-only test.
+for.cond.cleanup:
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
@@ -1,10 +1,10 @@
-; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-@A = global [1024 x i16] zeroinitializer, align 128
+@A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
 ; CHECK: LV: Checking a loop in "test"
@@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Found an estimated cost of 17 for VF 4 For instruction:   store i16 %v1, i16* %out1, align 2
 ; CHECK: LV: Found an estimated cost of 49 for VF 8 For instruction:   store i16 %v1, i16* %out1, align 2
 ; CHECK: LV: Found an estimated cost of 114 for VF 16 For instruction:   store i16 %v1, i16* %out1, align 2
+; CHECK: LV: Found an estimated cost of 228 for VF 32 For instruction:   store i16 %v1, i16* %out1, align 2
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %v1, i16* %out1, align 2
 
 define void @test() {
 entry:
@@ -24,8 +26,10 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
-  %v = load i16, i16* %in
+  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, i8* %in
+
+  %v = zext i8 %v.narrow to i16
 
   %v0 = add i16 %v, 0
   %v1 = add i16 %v, 1

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
@@ -1,10 +1,10 @@
-; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-@A = global [1024 x i16] zeroinitializer, align 128
+@A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
 ; CHECK: LV: Checking a loop in "test"
@@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Found an estimated cost of 35 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
 ; CHECK: LV: Found an estimated cost of 66 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
+; CHECK: LV: Found an estimated cost of 342 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %v2, i16* %out2, align 2
 
 define void @test() {
 entry:
@@ -25,8 +27,10 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
-  %v = load i16, i16* %in
+  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, i8* %in
+
+  %v = zext i8 %v.narrow to i16
 
   %v0 = add i16 %v, 0
   %v1 = add i16 %v, 1

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
@@ -1,10 +1,10 @@
-; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-@A = global [1024 x i16] zeroinitializer, align 128
+@A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
 ; CHECK: LV: Checking a loop in "test"
@@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Found an estimated cost of 49 for VF 4 For instruction:   store i16 %v3, i16* %out3, align 2
 ; CHECK: LV: Found an estimated cost of 98 for VF 8 For instruction:   store i16 %v3, i16* %out3, align 2
 ; CHECK: LV: Found an estimated cost of 228 for VF 16 For instruction:   store i16 %v3, i16* %out3, align 2
+; CHECK: LV: Found an estimated cost of 456 for VF 32 For instruction:   store i16 %v3, i16* %out3, align 2
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %v3, i16* %out3, align 2
 
 define void @test() {
 entry:
@@ -26,8 +28,10 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
-  %v = load i16, i16* %in
+  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, i8* %in
+
+  %v = zext i8 %v.narrow to i16
 
   %v0 = add i16 %v, 0
   %v1 = add i16 %v, 1

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
@@ -0,0 +1,60 @@
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; The command above feeds this file to opt's loop vectorizer with +avx2 and pipes the loop-vectorize debug output (stderr merged into stdout) to FileCheck.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; @A is the i8 source array read by the loop in @test; @B is the i16 destination array written by the interleaved stores.
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i16] zeroinitializer, align 128
+; Expected cost lines for the last store of the group (%v4 to %out4) at VF 1, 2, 4, 8 and 16; the final negative pattern rejects any further cost line for that store.
+; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
+; CHECK: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
+; CHECK: LV: Found an estimated cost of 58 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
+; CHECK: LV: Found an estimated cost of 115 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
+; CHECK: LV: Found an estimated cost of 285 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %v4, i16* %out4, align 2
+; Loop that loads one i8 from @A per iteration, zero-extends it to i16, and writes %v+0 .. %v+4 to 5 consecutive i16 elements of @B.
+define void @test() {
+entry:
+  br label %for.body
+; Each iteration writes the group @B[%iv] .. @B[%iv + 4]; %iv starts at 0 and advances by 5.
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; Element indices of the 5 group members (%iv.0 is %iv itself).
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+; Load the single i8 input for this iteration from @A[%iv].
+  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, i8* %in
+; Widen the input to i16, the element type of @B.
+  %v = zext i8 %v.narrow to i16
+; Derive the 5 values to store: the input plus 0 .. 4.
+  %v0 = add i16 %v, 0
+  %v1 = add i16 %v, 1
+  %v2 = add i16 %v, 2
+  %v3 = add i16 %v, 3
+  %v4 = add i16 %v, 4
+; Addresses of the 5 group members inside @B.
+  %out0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.4
+; The 5 stores of the group; this file's cost expectations are written against the %v4 store, so its operands must not change.
+  store i16 %v0, i16* %out0
+  store i16 %v1, i16* %out1
+  store i16 %v2, i16* %out2
+  store i16 %v3, i16* %out3
+  store i16 %v4, i16* %out4
+; Step by the group size (5) and keep looping while the next index is below 1024.
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+; NOTE(review): the last iteration entered has %iv = 1020, where %iv.4 = 1024 makes %out4 address one element past the end of @B -- confirm this is acceptable for a cost-only test.
+for.cond.cleanup:
+  ret void
+}