Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[NFC][X86][LV] Add costmodel test coverage for interleaved i32/f32 load/store stride=6
- Loading branch information
Showing 4 changed files with 348 additions and 0 deletions.
There are no files selected for viewing
86 changes: 86 additions & 0 deletions
86
llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512 | ||
; REQUIRES: asserts | ||
|
||
; Module-level target description: the RUN lines only vary -mattr, so every
; run shares this data layout and the x86_64 Linux triple.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | ||
target triple = "x86_64-unknown-linux-gnu" | ||
|
||
; @A: 1024 zero-initialized floats; the array read by the six loads in @test.
@A = global [1024 x float] zeroinitializer, align 128 | ||
; @B: 1024 zero-initialized bytes; the array written by the single store in @test.
@B = global [1024 x i8] zeroinitializer, align 128 | ||
|
||
; CHECK: LV: Checking a loop in "test" | ||
; | ||
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, float* %in0, align 4 | ||
; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load float, float* %in0, align 4 | ||
; SSE2: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load float, float* %in0, align 4 | ||
; SSE2: LV: Found an estimated cost of 84 for VF 8 For instruction: %v0 = load float, float* %in0, align 4 | ||
; | ||
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX1: LV: Found an estimated cost of 51 for VF 4 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX1: LV: Found an estimated cost of 114 for VF 8 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction: %v0 = load float, float* %in0, align 4 | ||
; | ||
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX2: LV: Found an estimated cost of 51 for VF 4 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX2: LV: Found an estimated cost of 114 for VF 8 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX2: LV: Found an estimated cost of 228 for VF 16 For instruction: %v0 = load float, float* %in0, align 4 | ||
; | ||
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX512: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load float, float* %in0, align 4 | ||
; AVX512: LV: Found an estimated cost of 210 for VF 32 For instruction: %v0 = load float, float* %in0, align 4 | ||
; | ||
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load float, float* %in0, align 4 | ||
|
||
; @test: loop that reads six adjacent floats from @A per iteration, sums them,
; converts the sum to i8 and writes that single byte to @B. The index advances
; by 6 each iteration (the file name calls this a stride-6 interleaved load).
;
; The CHECK lines earlier in this file match the cost-model debug output for
; the `%v0 = load float, float* %in0` instruction, so the value names %v0 and
; %in0 must stay in sync with those lines.
;
; NOTE(review): the loop body still runs for %iv = 1020, where %iv.4 / %iv.5
; are 1024 / 1025, i.e. past the last element of the 1024-element @A.
; Presumably harmless because only cost-model output is inspected -- confirm.
define void @test() { | ||
entry: | ||
br label %for.body | ||
|
||
for.body: | ||
; Induction variable: 0 on entry, %iv.next (iv + 6) on the back edge.
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] | ||
|
||
; Element indices iv+0 .. iv+5 for the six members of one group.
%iv.0 = add nuw nsw i64 %iv, 0 | ||
%iv.1 = add nuw nsw i64 %iv, 1 | ||
%iv.2 = add nuw nsw i64 %iv, 2 | ||
%iv.3 = add nuw nsw i64 %iv, 3 | ||
%iv.4 = add nuw nsw i64 %iv, 4 | ||
%iv.5 = add nuw nsw i64 %iv, 5 | ||
|
||
; Addresses of @A[iv+0] .. @A[iv+5].
%in0 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.0 | ||
%in1 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.1 | ||
%in2 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.2 | ||
%in3 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.3 | ||
%in4 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.4 | ||
%in5 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.5 | ||
|
||
; The six loads under test. No alignment is written here; the CHECK lines
; expect the instruction to be printed with `align 4`.
%v0 = load float, float* %in0 | ||
%v1 = load float, float* %in1 | ||
%v2 = load float, float* %in2 | ||
%v3 = load float, float* %in3 | ||
%v4 = load float, float* %in4 | ||
%v5 = load float, float* %in5 | ||
|
||
; Chain of fadds so that every loaded value is used.
%reduce.add.0 = fadd float %v0, %v1 | ||
%reduce.add.1 = fadd float %reduce.add.0, %v2 | ||
%reduce.add.2 = fadd float %reduce.add.1, %v3 | ||
%reduce.add.3 = fadd float %reduce.add.2, %v4 | ||
%reduce.add.4 = fadd float %reduce.add.3, %v5 | ||
|
||
; Convert the float sum to an unsigned 8-bit integer to match @B's element type.
%reduce.add.4.narrow = fptoui float %reduce.add.4 to i8 | ||
|
||
; Write the narrowed sum to @B[iv+0].
%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0 | ||
store i8 %reduce.add.4.narrow, i8* %out | ||
|
||
; Advance by 6 and keep looping while the next index is below 1024 (unsigned).
%iv.next = add nuw nsw i64 %iv.0, 6 | ||
%cmp = icmp ult i64 %iv.next, 1024 | ||
br i1 %cmp, label %for.body, label %for.cond.cleanup | ||
|
||
for.cond.cleanup: | ||
ret void | ||
} |
86 changes: 86 additions & 0 deletions
86
llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512 | ||
; REQUIRES: asserts | ||
|
||
; Module-level target description: the RUN lines only vary -mattr, so every
; run shares this data layout and the x86_64 Linux triple.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | ||
target triple = "x86_64-unknown-linux-gnu" | ||
|
||
; @A: 1024 zero-initialized i32 values; the array read by the six loads in @test.
@A = global [1024 x i32] zeroinitializer, align 128 | ||
; @B: 1024 zero-initialized bytes; the array written by the single store in @test.
@B = global [1024 x i8] zeroinitializer, align 128 | ||
|
||
; CHECK: LV: Checking a loop in "test" | ||
; | ||
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; SSE2: LV: Found an estimated cost of 42 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; SSE2: LV: Found an estimated cost of 90 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; SSE2: LV: Found an estimated cost of 180 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; | ||
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX1: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX1: LV: Found an estimated cost of 138 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX1: LV: Found an estimated cost of 276 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; | ||
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX2: LV: Found an estimated cost of 30 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX2: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX2: LV: Found an estimated cost of 138 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX2: LV: Found an estimated cost of 276 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; | ||
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX512: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; AVX512: LV: Found an estimated cost of 210 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4 | ||
; | ||
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i32, i32* %in0, align 4 | ||
|
||
; @test: loop that reads six adjacent i32 values from @A per iteration, sums
; them, truncates the sum to i8 and writes that single byte to @B. The index
; advances by 6 each iteration (the file name calls this a stride-6
; interleaved load).
;
; The CHECK lines earlier in this file match the cost-model debug output for
; the `%v0 = load i32, i32* %in0` instruction, so the value names %v0 and
; %in0 must stay in sync with those lines.
;
; NOTE(review): the loop body still runs for %iv = 1020, where %iv.4 / %iv.5
; are 1024 / 1025, i.e. past the last element of the 1024-element @A.
; Presumably harmless because only cost-model output is inspected -- confirm.
define void @test() { | ||
entry: | ||
br label %for.body | ||
|
||
for.body: | ||
; Induction variable: 0 on entry, %iv.next (iv + 6) on the back edge.
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] | ||
|
||
; Element indices iv+0 .. iv+5 for the six members of one group.
%iv.0 = add nuw nsw i64 %iv, 0 | ||
%iv.1 = add nuw nsw i64 %iv, 1 | ||
%iv.2 = add nuw nsw i64 %iv, 2 | ||
%iv.3 = add nuw nsw i64 %iv, 3 | ||
%iv.4 = add nuw nsw i64 %iv, 4 | ||
%iv.5 = add nuw nsw i64 %iv, 5 | ||
|
||
; Addresses of @A[iv+0] .. @A[iv+5].
%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0 | ||
%in1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.1 | ||
%in2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.2 | ||
%in3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.3 | ||
%in4 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.4 | ||
%in5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.5 | ||
|
||
; The six loads under test. No alignment is written here; the CHECK lines
; expect the instruction to be printed with `align 4`.
%v0 = load i32, i32* %in0 | ||
%v1 = load i32, i32* %in1 | ||
%v2 = load i32, i32* %in2 | ||
%v3 = load i32, i32* %in3 | ||
%v4 = load i32, i32* %in4 | ||
%v5 = load i32, i32* %in5 | ||
|
||
; Chain of integer adds so that every loaded value is used.
%reduce.add.0 = add i32 %v0, %v1 | ||
%reduce.add.1 = add i32 %reduce.add.0, %v2 | ||
%reduce.add.2 = add i32 %reduce.add.1, %v3 | ||
%reduce.add.3 = add i32 %reduce.add.2, %v4 | ||
%reduce.add.4 = add i32 %reduce.add.3, %v5 | ||
|
||
; Truncate the i32 sum to i8 to match @B's element type.
%reduce.add.4.narrow = trunc i32 %reduce.add.4 to i8 | ||
|
||
; Write the narrowed sum to @B[iv+0].
%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0 | ||
store i8 %reduce.add.4.narrow, i8* %out | ||
|
||
; Advance by 6 and keep looping while the next index is below 1024 (unsigned).
%iv.next = add nuw nsw i64 %iv.0, 6 | ||
%cmp = icmp ult i64 %iv.next, 1024 | ||
br i1 %cmp, label %for.body, label %for.cond.cleanup | ||
|
||
for.cond.cleanup: | ||
ret void | ||
} |
88 changes: 88 additions & 0 deletions
88
llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512 | ||
; REQUIRES: asserts | ||
|
||
; Module-level target description: the RUN lines only vary -mattr, so every
; run shares this data layout and the x86_64 Linux triple.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | ||
target triple = "x86_64-unknown-linux-gnu" | ||
|
||
; @A: 1024 zero-initialized bytes; the array read by the single load in @test.
@A = global [1024 x i8] zeroinitializer, align 128 | ||
; @B: 1024 zero-initialized floats; the array written by the six stores in @test.
@B = global [1024 x float] zeroinitializer, align 128 | ||
|
||
; CHECK: LV: Checking a loop in "test" | ||
; | ||
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, float* %out5, align 4 | ||
; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: store float %v5, float* %out5, align 4 | ||
; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: store float %v5, float* %out5, align 4 | ||
; SSE2: LV: Found an estimated cost of 96 for VF 8 For instruction: store float %v5, float* %out5, align 4 | ||
; | ||
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX1: LV: Found an estimated cost of 20 for VF 2 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX1: LV: Found an estimated cost of 114 for VF 8 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction: store float %v5, float* %out5, align 4 | ||
; | ||
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX2: LV: Found an estimated cost of 114 for VF 8 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX2: LV: Found an estimated cost of 228 for VF 16 For instruction: store float %v5, float* %out5, align 4 | ||
; | ||
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 17 for VF 4 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 102 for VF 32 For instruction: store float %v5, float* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction: store float %v5, float* %out5, align 4 | ||
; | ||
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store float %v5, float* %out5, align 4 | ||
|
||
; @test: loop that reads one byte from @A per iteration, converts it to float,
; derives six values from it and stores them to six adjacent floats of @B.
; The index advances by 6 each iteration (the file name calls this a stride-6
; interleaved store).
;
; The CHECK lines earlier in this file match the cost-model debug output for
; the `store float %v5, float* %out5` instruction, so the value names %v5 and
; %out5 must stay in sync with those lines.
;
; NOTE(review): the loop body still runs for %iv = 1020, where %iv.4 / %iv.5
; are 1024 / 1025, i.e. past the last element of the 1024-element @B.
; Presumably harmless because only cost-model output is inspected -- confirm.
define void @test() { | ||
entry: | ||
br label %for.body | ||
|
||
for.body: | ||
; Induction variable: 0 on entry, %iv.next (iv + 6) on the back edge.
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] | ||
|
||
; Element indices iv+0 .. iv+5 for the six members of one group.
%iv.0 = add nuw nsw i64 %iv, 0 | ||
%iv.1 = add nuw nsw i64 %iv, 1 | ||
%iv.2 = add nuw nsw i64 %iv, 2 | ||
%iv.3 = add nuw nsw i64 %iv, 3 | ||
%iv.4 = add nuw nsw i64 %iv, 4 | ||
%iv.5 = add nuw nsw i64 %iv, 5 | ||
|
||
; Single narrow input: the byte at @A[iv+0].
%in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0 | ||
%v.narrow = load i8, i8* %in | ||
|
||
; Widen the unsigned byte to float, the element type of @B.
%v = uitofp i8 %v.narrow to float | ||
|
||
; Six values to store, each the widened input plus a different constant.
%v0 = fadd float %v, 0.0 | ||
%v1 = fadd float %v, 1.0 | ||
%v2 = fadd float %v, 2.0 | ||
%v3 = fadd float %v, 3.0 | ||
%v4 = fadd float %v, 4.0 | ||
%v5 = fadd float %v, 5.0 | ||
|
||
; Addresses of @B[iv+0] .. @B[iv+5].
%out0 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.0 | ||
%out1 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.1 | ||
%out2 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.2 | ||
%out3 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.3 | ||
%out4 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.4 | ||
%out5 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.5 | ||
|
||
; The six stores under test. No alignment is written here; the CHECK lines
; expect the instruction to be printed with `align 4`.
store float %v0, float* %out0 | ||
store float %v1, float* %out1 | ||
store float %v2, float* %out2 | ||
store float %v3, float* %out3 | ||
store float %v4, float* %out4 | ||
store float %v5, float* %out5 | ||
|
||
; Advance by 6 and keep looping while the next index is below 1024 (unsigned).
%iv.next = add nuw nsw i64 %iv.0, 6 | ||
%cmp = icmp ult i64 %iv.next, 1024 | ||
br i1 %cmp, label %for.body, label %for.cond.cleanup | ||
|
||
for.cond.cleanup: | ||
ret void | ||
} |
88 changes: 88 additions & 0 deletions
88
llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2 | ||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512 | ||
; REQUIRES: asserts | ||
|
||
; Module-level target description: the RUN lines only vary -mattr, so every
; run shares this data layout and the x86_64 Linux triple.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" | ||
target triple = "x86_64-unknown-linux-gnu" | ||
|
||
; @A: 1024 zero-initialized bytes; the array read by the single load in @test.
@A = global [1024 x i8] zeroinitializer, align 128 | ||
; @B: 1024 zero-initialized i32 values; the array written by the six stores in @test.
@B = global [1024 x i32] zeroinitializer, align 128 | ||
|
||
; CHECK: LV: Checking a loop in "test" | ||
; | ||
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; SSE2: LV: Found an estimated cost of 45 for VF 2 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; SSE2: LV: Found an estimated cost of 96 for VF 4 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; SSE2: LV: Found an estimated cost of 192 for VF 8 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; | ||
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX1: LV: Found an estimated cost of 29 for VF 2 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX1: LV: Found an estimated cost of 57 for VF 4 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX1: LV: Found an estimated cost of 138 for VF 8 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX1: LV: Found an estimated cost of 276 for VF 16 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; | ||
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX2: LV: Found an estimated cost of 29 for VF 2 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX2: LV: Found an estimated cost of 57 for VF 4 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX2: LV: Found an estimated cost of 138 for VF 8 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX2: LV: Found an estimated cost of 276 for VF 16 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; | ||
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 17 for VF 4 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 102 for VF 32 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction: store i32 %v5, i32* %out5, align 4 | ||
; | ||
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i32 %v5, i32* %out5, align 4 | ||
|
||
; @test: loop that reads one byte from @A per iteration, zero-extends it to
; i32, derives six values from it and stores them to six adjacent i32 elements
; of @B. The index advances by 6 each iteration (the file name calls this a
; stride-6 interleaved store).
;
; The CHECK lines earlier in this file match the cost-model debug output for
; the `store i32 %v5, i32* %out5` instruction, so the value names %v5 and
; %out5 must stay in sync with those lines.
;
; NOTE(review): the loop body still runs for %iv = 1020, where %iv.4 / %iv.5
; are 1024 / 1025, i.e. past the last element of the 1024-element @B.
; Presumably harmless because only cost-model output is inspected -- confirm.
define void @test() { | ||
entry: | ||
br label %for.body | ||
|
||
for.body: | ||
; Induction variable: 0 on entry, %iv.next (iv + 6) on the back edge.
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] | ||
|
||
; Element indices iv+0 .. iv+5 for the six members of one group.
%iv.0 = add nuw nsw i64 %iv, 0 | ||
%iv.1 = add nuw nsw i64 %iv, 1 | ||
%iv.2 = add nuw nsw i64 %iv, 2 | ||
%iv.3 = add nuw nsw i64 %iv, 3 | ||
%iv.4 = add nuw nsw i64 %iv, 4 | ||
%iv.5 = add nuw nsw i64 %iv, 5 | ||
|
||
; Single narrow input: the byte at @A[iv+0].
%in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0 | ||
%v.narrow = load i8, i8* %in | ||
|
||
; Zero-extend the byte to i32, the element type of @B.
%v = zext i8 %v.narrow to i32 | ||
|
||
; Six values to store, each the widened input plus a different constant.
%v0 = add i32 %v, 0 | ||
%v1 = add i32 %v, 1 | ||
%v2 = add i32 %v, 2 | ||
%v3 = add i32 %v, 3 | ||
%v4 = add i32 %v, 4 | ||
%v5 = add i32 %v, 5 | ||
|
||
; Addresses of @B[iv+0] .. @B[iv+5].
%out0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.0 | ||
%out1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.1 | ||
%out2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.2 | ||
%out3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.3 | ||
%out4 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.4 | ||
%out5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.5 | ||
|
||
; The six stores under test. No alignment is written here; the CHECK lines
; expect the instruction to be printed with `align 4`.
store i32 %v0, i32* %out0 | ||
store i32 %v1, i32* %out1 | ||
store i32 %v2, i32* %out2 | ||
store i32 %v3, i32* %out3 | ||
store i32 %v4, i32* %out4 | ||
store i32 %v5, i32* %out5 | ||
|
||
; Advance by 6 and keep looping while the next index is below 1024 (unsigned).
%iv.next = add nuw nsw i64 %iv.0, 6 | ||
%cmp = icmp ult i64 %iv.next, 1024 | ||
br i1 %cmp, label %for.body, label %for.cond.cleanup | ||
|
||
for.cond.cleanup: | ||
ret void | ||
} |