Skip to content

Commit

Permalink
[NFC][X86][Costmodel] Add some more interleaved load/store tests with …
Browse files Browse the repository at this point in the history
…i16 element type

Not sure if even larger interleaving factors are needed,
but these are what I have seen being queried in the wild.
  • Loading branch information
LebedevRI committed May 26, 2021
1 parent ff08c34 commit 78c9796
Show file tree
Hide file tree
Showing 10 changed files with 294 additions and 24 deletions.
@@ -1,18 +1,20 @@
; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@A = global [1024 x i16] zeroinitializer, align 128
@B = global [1024 x i16] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128

; CHECK: LV: Checking a loop in "test"
; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 114 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 228 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i16, i16* %in0, align 2

define void @test() {
entry:
Expand All @@ -32,8 +34,10 @@ for.body:

%reduce.add.0 = add i16 %v0, %v1

%out = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
store i16 %reduce.add.0, i16* %out
%reduce.add.0.narrow = trunc i16 %reduce.add.0 to i8

%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.0.narrow, i8* %out

%iv.next = add nuw nsw i64 %iv.0, 2
%cmp = icmp ult i64 %iv.next, 1024
Expand Down
@@ -1,18 +1,20 @@
; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@A = global [1024 x i16] zeroinitializer, align 128
@B = global [1024 x i16] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128

; CHECK: LV: Checking a loop in "test"
; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 342 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i16, i16* %in0, align 2

define void @test() {
entry:
Expand All @@ -36,8 +38,10 @@ for.body:
%reduce.add.0 = add i16 %v0, %v1
%reduce.add.1 = add i16 %reduce.add.0, %v2

%out = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
store i16 %reduce.add.1, i16* %out
%reduce.add.1.narrow = trunc i16 %reduce.add.1 to i8

%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.1.narrow, i8* %out

%iv.next = add nuw nsw i64 %iv.0, 3
%cmp = icmp ult i64 %iv.next, 1024
Expand Down
@@ -1,18 +1,20 @@
; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@A = global [1024 x i16] zeroinitializer, align 128
@B = global [1024 x i16] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128

; CHECK: LV: Checking a loop in "test"
; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 41 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 82 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 228 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 456 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i16, i16* %in0, align 2

define void @test() {
entry:
Expand Down Expand Up @@ -40,8 +42,10 @@ for.body:
%reduce.add.1 = add i16 %reduce.add.0, %v2
%reduce.add.2 = add i16 %reduce.add.1, %v3

%out = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
store i16 %reduce.add.2, i16* %out
%reduce.add.2.narrow = trunc i16 %reduce.add.2 to i8

%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.2.narrow, i8* %out

%iv.next = add nuw nsw i64 %iv.0, 4
%cmp = icmp ult i64 %iv.next, 1024
Expand Down
59 changes: 59 additions & 0 deletions llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -0,0 +1,59 @@
; Cost-model test for a stride-5 interleaved i16 load group on x86 with AVX2.
; The loop vectorizer is run with -vectorizer-maximize-bandwidth and its debug
; output (stderr, merged into stdout by 2>&1) is matched against the expected
; per-VF cost estimates listed further down.
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; @A is the i16 source array read by the loop; @B is the i8 destination array.
@A = global [1024 x i16] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128

; Expected estimated cost of the first load of the group (%v0) at VF 1, 2, 4, 8
; and 16. The final negative directive rejects any further cost line for that
; load after the VF 16 entry.
; CHECK: LV: Checking a loop in "test"
; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 285 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i16, i16* %in0, align 2

; Single-block loop that reads five consecutive i16 elements of @A per
; iteration, sums them, truncates the sum to i8 and stores it into @B.
; The induction variable advances by 5, so the five loads together form the
; stride-5 access pattern whose cost the directives above pin down.
define void @test() {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]

; Element indices of the five group members: %iv + 0 .. %iv + 4.
; NOTE(review): the loop runs while %iv.next < 1024, so the last iteration has
; %iv = 1020 and %iv.4 = 1024, which is past the last element of the
; 1024-element @A. Presumably irrelevant for a cost-only test; confirm.
%iv.0 = add nuw nsw i64 %iv, 0
%iv.1 = add nuw nsw i64 %iv, 1
%iv.2 = add nuw nsw i64 %iv, 2
%iv.3 = add nuw nsw i64 %iv, 3
%iv.4 = add nuw nsw i64 %iv, 4

; Addresses of the five adjacent i16 elements in @A.
%in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
%in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1
%in2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.2
%in3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.3
%in4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.4

; The load group under test. The expected debug output matches the %v0 line
; exactly, so these names and types must stay as they are.
%v0 = load i16, i16* %in0
%v1 = load i16, i16* %in1
%v2 = load i16, i16* %in2
%v3 = load i16, i16* %in3
%v4 = load i16, i16* %in4

; Chain of adds that consumes every loaded value.
%reduce.add.0 = add i16 %v0, %v1
%reduce.add.1 = add i16 %reduce.add.0, %v2
%reduce.add.2 = add i16 %reduce.add.1, %v3
%reduce.add.3 = add i16 %reduce.add.2, %v4

; Narrow the i16 sum to i8 so that the stored element type is i8.
%reduce.add.3.narrow = trunc i16 %reduce.add.3 to i8

; The result is written to @B at index %iv.0, i.e. one i8 store per group.
%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.3.narrow, i8* %out

; Step by the interleave factor (5); %iv.0 equals %iv because it adds 0.
%iv.next = add nuw nsw i64 %iv.0, 5
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
ret void
}
63 changes: 63 additions & 0 deletions llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
@@ -0,0 +1,63 @@
; Cost-model test for a stride-6 interleaved i16 load group on x86 with AVX2.
; The loop vectorizer is run with -vectorizer-maximize-bandwidth and its debug
; output (stderr, merged into stdout by 2>&1) is matched against the expected
; per-VF cost estimates listed further down.
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; @A is the i16 source array read by the loop; @B is the i8 destination array.
@A = global [1024 x i16] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128

; Expected estimated cost of the first load of the group (%v0) at VF 1, 2, 4, 8
; and 16. The final negative directive rejects any further cost line for that
; load after the VF 16 entry.
; CHECK: LV: Checking a loop in "test"
; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 31 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 58 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 123 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK: LV: Found an estimated cost of 342 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i16, i16* %in0, align 2

; Single-block loop that reads six consecutive i16 elements of @A per
; iteration, sums them, truncates the sum to i8 and stores it into @B.
; The induction variable advances by 6, so the six loads together form the
; stride-6 access pattern whose cost the directives above pin down.
define void @test() {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]

; Element indices of the six group members: %iv + 0 .. %iv + 5.
; NOTE(review): the loop runs while %iv.next < 1024, so the last iteration has
; %iv = 1020, giving %iv.4 = 1024 and %iv.5 = 1025, both past the last element
; of the 1024-element @A. Presumably irrelevant for a cost-only test; confirm.
%iv.0 = add nuw nsw i64 %iv, 0
%iv.1 = add nuw nsw i64 %iv, 1
%iv.2 = add nuw nsw i64 %iv, 2
%iv.3 = add nuw nsw i64 %iv, 3
%iv.4 = add nuw nsw i64 %iv, 4
%iv.5 = add nuw nsw i64 %iv, 5

; Addresses of the six adjacent i16 elements in @A.
%in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
%in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1
%in2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.2
%in3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.3
%in4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.4
%in5 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.5

; The load group under test. The expected debug output matches the %v0 line
; exactly, so these names and types must stay as they are.
%v0 = load i16, i16* %in0
%v1 = load i16, i16* %in1
%v2 = load i16, i16* %in2
%v3 = load i16, i16* %in3
%v4 = load i16, i16* %in4
%v5 = load i16, i16* %in5

; Chain of adds that consumes every loaded value.
%reduce.add.0 = add i16 %v0, %v1
%reduce.add.1 = add i16 %reduce.add.0, %v2
%reduce.add.2 = add i16 %reduce.add.1, %v3
%reduce.add.3 = add i16 %reduce.add.2, %v4
%reduce.add.4 = add i16 %reduce.add.3, %v5

; Narrow the i16 sum to i8 so that the stored element type is i8.
%reduce.add.4.narrow = trunc i16 %reduce.add.4 to i8

; The result is written to @B at index %iv.0, i.e. one i8 store per group.
%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.4.narrow, i8* %out

; Step by the interleave factor (6); %iv.0 equals %iv because it adds 0.
%iv.next = add nuw nsw i64 %iv.0, 6
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
ret void
}
@@ -1,10 +1,10 @@
; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@A = global [1024 x i16] zeroinitializer, align 128
@A = global [1024 x i8] zeroinitializer, align 128
@B = global [1024 x i16] zeroinitializer, align 128

; CHECK: LV: Checking a loop in "test"
Expand All @@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %v1, i16* %out1, align 2
; CHECK: LV: Found an estimated cost of 49 for VF 8 For instruction: store i16 %v1, i16* %out1, align 2
; CHECK: LV: Found an estimated cost of 114 for VF 16 For instruction: store i16 %v1, i16* %out1, align 2
; CHECK: LV: Found an estimated cost of 228 for VF 32 For instruction: store i16 %v1, i16* %out1, align 2
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %v1, i16* %out1, align 2

define void @test() {
entry:
Expand All @@ -24,8 +26,10 @@ for.body:
%iv.0 = add nuw nsw i64 %iv, 0
%iv.1 = add nuw nsw i64 %iv, 1

%in = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
%v = load i16, i16* %in
%in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
%v.narrow = load i8, i8* %in

%v = zext i8 %v.narrow to i16

%v0 = add i16 %v, 0
%v1 = add i16 %v, 1
Expand Down
@@ -1,10 +1,10 @@
; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@A = global [1024 x i16] zeroinitializer, align 128
@A = global [1024 x i8] zeroinitializer, align 128
@B = global [1024 x i16] zeroinitializer, align 128

; CHECK: LV: Checking a loop in "test"
Expand All @@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
; CHECK: LV: Found an estimated cost of 66 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2
; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2
; CHECK: LV: Found an estimated cost of 342 for VF 32 For instruction: store i16 %v2, i16* %out2, align 2
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %v2, i16* %out2, align 2

define void @test() {
entry:
Expand All @@ -25,8 +27,10 @@ for.body:
%iv.1 = add nuw nsw i64 %iv, 1
%iv.2 = add nuw nsw i64 %iv, 2

%in = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
%v = load i16, i16* %in
%in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
%v.narrow = load i8, i8* %in

%v = zext i8 %v.narrow to i16

%v0 = add i16 %v, 0
%v1 = add i16 %v, 1
Expand Down
@@ -1,10 +1,10 @@
; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@A = global [1024 x i16] zeroinitializer, align 128
@A = global [1024 x i8] zeroinitializer, align 128
@B = global [1024 x i16] zeroinitializer, align 128

; CHECK: LV: Checking a loop in "test"
Expand All @@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: LV: Found an estimated cost of 49 for VF 4 For instruction: store i16 %v3, i16* %out3, align 2
; CHECK: LV: Found an estimated cost of 98 for VF 8 For instruction: store i16 %v3, i16* %out3, align 2
; CHECK: LV: Found an estimated cost of 228 for VF 16 For instruction: store i16 %v3, i16* %out3, align 2
; CHECK: LV: Found an estimated cost of 456 for VF 32 For instruction: store i16 %v3, i16* %out3, align 2
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %v3, i16* %out3, align 2

define void @test() {
entry:
Expand All @@ -26,8 +28,10 @@ for.body:
%iv.2 = add nuw nsw i64 %iv, 2
%iv.3 = add nuw nsw i64 %iv, 3

%in = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
%v = load i16, i16* %in
%in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
%v.narrow = load i8, i8* %in

%v = zext i8 %v.narrow to i16

%v0 = add i16 %v, 0
%v1 = add i16 %v, 1
Expand Down
60 changes: 60 additions & 0 deletions llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
@@ -0,0 +1,60 @@
; Cost-model test for a stride-5 interleaved i16 store group on x86 with AVX2.
; The loop vectorizer is run with -vectorizer-maximize-bandwidth and its debug
; output (stderr, merged into stdout by 2>&1) is matched against the expected
; per-VF cost estimates listed further down.
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; @A is the i8 source array read by the loop; @B is the i16 destination array.
@A = global [1024 x i8] zeroinitializer, align 128
@B = global [1024 x i16] zeroinitializer, align 128

; Expected estimated cost of the last store of the group (%v4 to %out4) at
; VF 1, 2, 4, 8 and 16. The final negative directive rejects any further cost
; line for that store after the VF 16 entry.
; CHECK: LV: Checking a loop in "test"
; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, i16* %out4, align 2
; CHECK: LV: Found an estimated cost of 28 for VF 2 For instruction: store i16 %v4, i16* %out4, align 2
; CHECK: LV: Found an estimated cost of 58 for VF 4 For instruction: store i16 %v4, i16* %out4, align 2
; CHECK: LV: Found an estimated cost of 115 for VF 8 For instruction: store i16 %v4, i16* %out4, align 2
; CHECK: LV: Found an estimated cost of 285 for VF 16 For instruction: store i16 %v4, i16* %out4, align 2
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %v4, i16* %out4, align 2

; Single-block loop that loads one i8 from @A per iteration, zero-extends it to
; i16, derives five values from it and stores them to five consecutive i16
; elements of @B. The induction variable advances by 5, so the five stores
; together form the stride-5 access pattern whose cost the directives above
; pin down.
define void @test() {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]

; Element indices of the five group members: %iv + 0 .. %iv + 4.
; NOTE(review): the loop runs while %iv.next < 1024, so the last iteration has
; %iv = 1020 and %iv.4 = 1024, which is past the last element of the
; 1024-element @B. Presumably irrelevant for a cost-only test; confirm.
%iv.0 = add nuw nsw i64 %iv, 0
%iv.1 = add nuw nsw i64 %iv, 1
%iv.2 = add nuw nsw i64 %iv, 2
%iv.3 = add nuw nsw i64 %iv, 3
%iv.4 = add nuw nsw i64 %iv, 4

; One i8 load from @A at index %iv.0 feeds all five stored values.
%in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
%v.narrow = load i8, i8* %in

; Widen to the i16 element type used by the stores.
%v = zext i8 %v.narrow to i16

; Five values to store, one per group member: %v + 0 .. %v + 4.
%v0 = add i16 %v, 0
%v1 = add i16 %v, 1
%v2 = add i16 %v, 2
%v3 = add i16 %v, 3
%v4 = add i16 %v, 4

; Addresses of the five adjacent i16 elements in @B.
%out0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
%out1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.1
%out2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.2
%out3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.3
%out4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.4

; The store group under test. The expected debug output matches the %v4/%out4
; line exactly, so these names and types must stay as they are.
store i16 %v0, i16* %out0
store i16 %v1, i16* %out1
store i16 %v2, i16* %out2
store i16 %v3, i16* %out3
store i16 %v4, i16* %out4

; Step by the interleave factor (5); %iv.0 equals %iv because it adds 0.
%iv.next = add nuw nsw i64 %iv.0, 5
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
ret void
}

0 comments on commit 78c9796

Please sign in to comment.