Skip to content

Commit

Permalink
[NFC][X86][LV] Add basic costmodel test coverage for not-fully-interl…
Browse files Browse the repository at this point in the history
…eaved i32 loads

The coverage could have cumulative explosion here,
so i'm adding only the most basic cases,
and hoping it's enough, though more can be added if needed.
  • Loading branch information
LebedevRI committed Oct 5, 2021
1 parent 16b8f4d commit 200edc1
Show file tree
Hide file tree
Showing 6 changed files with 441 additions and 0 deletions.
@@ -0,0 +1,71 @@
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@A = global [1024 x i32] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128

; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, i32* %in0, align 4
;
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i32, i32* %in0, align 2

define void @test() {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]

%iv.0 = add nuw nsw i64 %iv, 0

%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0

%v0 = load i32, i32* %in0

%reduce.add.0 = add i32 %v0, 0

%reduce.add.0.narrow = trunc i32 %reduce.add.0 to i8

%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.0.narrow, i8* %out

%iv.next = add nuw nsw i64 %iv.0, 2
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
ret void
}
@@ -0,0 +1,75 @@
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@A = global [1024 x i32] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128

; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, i32* %in0, align 4
;
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i32, i32* %in0, align 4

define void @test() {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]

%iv.0 = add nuw nsw i64 %iv, 0
%iv.1 = add nuw nsw i64 %iv, 1

%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
%in1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.1

%v0 = load i32, i32* %in0
%v1 = load i32, i32* %in1

%reduce.add.0 = add i32 %v0, %v1
%reduce.add.1 = add i32 %reduce.add.0, 0

%reduce.add.1.narrow = trunc i32 %reduce.add.1 to i8

%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.1.narrow, i8* %out

%iv.next = add nuw nsw i64 %iv.0, 3
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
ret void
}
@@ -0,0 +1,70 @@
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@A = global [1024 x i32] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128

; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, i32* %in0, align 4

define void @test() {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]

%iv.0 = add nuw nsw i64 %iv, 0

%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0

%v0 = load i32, i32* %in0

%reduce.add.0 = add i32 %v0, 0
%reduce.add.1 = add i32 %reduce.add.0, 0

%reduce.add.1.narrow = trunc i32 %reduce.add.1 to i8

%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.1.narrow, i8* %out

%iv.next = add nuw nsw i64 %iv.0, 3
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
ret void
}
@@ -0,0 +1,76 @@
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@A = global [1024 x i32] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128

; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, i32* %in0, align 4
;
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i32, i32* %in0, align 4

define void @test() {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]

%iv.0 = add nuw nsw i64 %iv, 0
%iv.1 = add nuw nsw i64 %iv, 1
%iv.2 = add nuw nsw i64 %iv, 2

%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
%in1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.1
%in2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.2

%v0 = load i32, i32* %in0
%v1 = load i32, i32* %in1
%v2 = load i32, i32* %in2

%reduce.add.0 = add i32 %v0, %v1
%reduce.add.1 = add i32 %reduce.add.0, %v2
%reduce.add.2 = add i32 %reduce.add.1, 0

%reduce.add.2.narrow = trunc i32 %reduce.add.2 to i8

%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.2.narrow, i8* %out

%iv.next = add nuw nsw i64 %iv.0, 4
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
ret void
}

0 comments on commit 200edc1

Please sign in to comment.