From 78c9796f963f0577b86de7bf062f65b4595cb757 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 26 May 2021 15:45:10 +0300 Subject: [PATCH] [NFC][X86][Costmodel] Add some more interleaved load/store tests with i16 element type Not sure if even larger interleaving factors are needed, but these are what I have seen being queried in the wild. --- .../X86/interleaved-load-i16-stride-2.ll | 12 ++-- .../X86/interleaved-load-i16-stride-3.ll | 12 ++-- .../X86/interleaved-load-i16-stride-4.ll | 12 ++-- .../X86/interleaved-load-i16-stride-5.ll | 59 +++++++++++++++++ .../X86/interleaved-load-i16-stride-6.ll | 63 ++++++++++++++++++ .../X86/interleaved-store-i16-stride-2.ll | 12 ++-- .../X86/interleaved-store-i16-stride-3.ll | 12 ++-- .../X86/interleaved-store-i16-stride-4.ll | 12 ++-- .../X86/interleaved-store-i16-stride-5.ll | 60 +++++++++++++++++ .../X86/interleaved-store-i16-stride-6.ll | 64 +++++++++++++++++++ 10 files changed, 294 insertions(+), 24 deletions(-) create mode 100644 llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll create mode 100644 llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll create mode 100644 llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll create mode 100644 llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll index e3625caa51b87..58f8773f62150 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll @@ -1,11 +1,11 @@ -; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = 
"x86_64-unknown-linux-gnu" @A = global [1024 x i16] zeroinitializer, align 128 -@B = global [1024 x i16] zeroinitializer, align 128 +@B = global [1024 x i8] zeroinitializer, align 128 ; CHECK: LV: Checking a loop in "test" ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 @@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; CHECK: LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 ; CHECK: LV: Found an estimated cost of 114 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 228 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i16, i16* %in0, align 2 define void @test() { entry: @@ -32,8 +34,10 @@ for.body: %reduce.add.0 = add i16 %v0, %v1 - %out = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0 - store i16 %reduce.add.0, i16* %out + %reduce.add.0.narrow = trunc i16 %reduce.add.0 to i8 + + %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0 + store i8 %reduce.add.0.narrow, i8* %out %iv.next = add nuw nsw i64 %iv.0, 2 %cmp = icmp ult i64 %iv.next, 1024 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll index 6c87955f8db0c..945f903d73da1 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll @@ -1,11 +1,11 @@ -; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s ; REQUIRES: asserts target 
datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @A = global [1024 x i16] zeroinitializer, align 128 -@B = global [1024 x i16] zeroinitializer, align 128 +@B = global [1024 x i8] zeroinitializer, align 128 ; CHECK: LV: Checking a loop in "test" ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 @@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; CHECK: LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 342 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i16, i16* %in0, align 2 define void @test() { entry: @@ -36,8 +38,10 @@ for.body: %reduce.add.0 = add i16 %v0, %v1 %reduce.add.1 = add i16 %reduce.add.0, %v2 - %out = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0 - store i16 %reduce.add.1, i16* %out + %reduce.add.1.narrow = trunc i16 %reduce.add.1 to i8 + + %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0 + store i8 %reduce.add.1.narrow, i8* %out %iv.next = add nuw nsw i64 %iv.0, 3 %cmp = icmp ult i64 %iv.next, 1024 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll index 4be1910162070..a1682dcf7c50f 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll @@ -1,11 +1,11 @@ -; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize 
-vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @A = global [1024 x i16] zeroinitializer, align 128 -@B = global [1024 x i16] zeroinitializer, align 128 +@B = global [1024 x i8] zeroinitializer, align 128 ; CHECK: LV: Checking a loop in "test" ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 @@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: LV: Found an estimated cost of 41 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; CHECK: LV: Found an estimated cost of 82 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 ; CHECK: LV: Found an estimated cost of 228 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 456 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i16, i16* %in0, align 2 define void @test() { entry: @@ -40,8 +42,10 @@ for.body: %reduce.add.1 = add i16 %reduce.add.0, %v2 %reduce.add.2 = add i16 %reduce.add.1, %v3 - %out = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0 - store i16 %reduce.add.2, i16* %out + %reduce.add.2.narrow = trunc i16 %reduce.add.2 to i8 + + %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0 + store i8 %reduce.add.2.narrow, i8* %out %iv.next = add nuw nsw i64 %iv.0, 4 %cmp = icmp ult i64 %iv.next, 1024 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll new file mode 100644 index 0000000000000..308717d9f4198 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll @@ -0,0 +1,59 @@ +; RUN: opt -loop-vectorize 
-vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [1024 x i16] zeroinitializer, align 128 +@B = global [1024 x i8] zeroinitializer, align 128 + +; CHECK: LV: Checking a loop in "test" +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 285 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i16, i16* %in0, align 2 + +define void @test() { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + + %iv.0 = add nuw nsw i64 %iv, 0 + %iv.1 = add nuw nsw i64 %iv, 1 + %iv.2 = add nuw nsw i64 %iv, 2 + %iv.3 = add nuw nsw i64 %iv, 3 + %iv.4 = add nuw nsw i64 %iv, 4 + + %in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0 + %in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1 + %in2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.2 + %in3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.3 + %in4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.4 + + %v0 = load i16, i16* %in0 + %v1 = load i16, i16* %in1 + %v2 = load i16, i16* %in2 + %v3 = load i16, i16* %in3 + %v4 = load i16, i16* %in4 + + %reduce.add.0 = add i16 %v0, %v1 + %reduce.add.1 = add i16 %reduce.add.0, %v2 + %reduce.add.2 = add i16 %reduce.add.1, %v3 + 
%reduce.add.3 = add i16 %reduce.add.2, %v4 + + %reduce.add.3.narrow = trunc i16 %reduce.add.3 to i8 + + %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0 + store i8 %reduce.add.3.narrow, i8* %out + + %iv.next = add nuw nsw i64 %iv.0, 5 + %cmp = icmp ult i64 %iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll new file mode 100644 index 0000000000000..ca9d1cada20f6 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll @@ -0,0 +1,63 @@ +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [1024 x i16] zeroinitializer, align 128 +@B = global [1024 x i8] zeroinitializer, align 128 + +; CHECK: LV: Checking a loop in "test" +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 31 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 58 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 123 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK: LV: Found an estimated cost of 342 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i16, i16* %in0, align 2 + +define void @test() { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + + %iv.0 = add nuw nsw i64 %iv, 0 + %iv.1 = add nuw nsw i64 %iv, 1 + %iv.2 = add nuw nsw i64 %iv, 2 + %iv.3 = add nuw nsw 
i64 %iv, 3 + %iv.4 = add nuw nsw i64 %iv, 4 + %iv.5 = add nuw nsw i64 %iv, 5 + + %in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0 + %in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1 + %in2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.2 + %in3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.3 + %in4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.4 + %in5 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.5 + + %v0 = load i16, i16* %in0 + %v1 = load i16, i16* %in1 + %v2 = load i16, i16* %in2 + %v3 = load i16, i16* %in3 + %v4 = load i16, i16* %in4 + %v5 = load i16, i16* %in5 + + %reduce.add.0 = add i16 %v0, %v1 + %reduce.add.1 = add i16 %reduce.add.0, %v2 + %reduce.add.2 = add i16 %reduce.add.1, %v3 + %reduce.add.3 = add i16 %reduce.add.2, %v4 + %reduce.add.4 = add i16 %reduce.add.3, %v5 + + %reduce.add.4.narrow = trunc i16 %reduce.add.4 to i8 + + %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0 + store i8 %reduce.add.4.narrow, i8* %out + + %iv.next = add nuw nsw i64 %iv.0, 6 + %cmp = icmp ult i64 %iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll index 3f1347c64c41d..d184c5096b841 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll @@ -1,10 +1,10 @@ -; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = 
"x86_64-unknown-linux-gnu" -@A = global [1024 x i16] zeroinitializer, align 128 +@A = global [1024 x i8] zeroinitializer, align 128 @B = global [1024 x i16] zeroinitializer, align 128 ; CHECK: LV: Checking a loop in "test" @@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %v1, i16* %out1, align 2 ; CHECK: LV: Found an estimated cost of 49 for VF 8 For instruction: store i16 %v1, i16* %out1, align 2 ; CHECK: LV: Found an estimated cost of 114 for VF 16 For instruction: store i16 %v1, i16* %out1, align 2 +; CHECK: LV: Found an estimated cost of 228 for VF 32 For instruction: store i16 %v1, i16* %out1, align 2 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %v1, i16* %out1, align 2 define void @test() { entry: @@ -24,8 +26,10 @@ for.body: %iv.0 = add nuw nsw i64 %iv, 0 %iv.1 = add nuw nsw i64 %iv, 1 - %in = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0 - %v = load i16, i16* %in + %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0 + %v.narrow = load i8, i8* %in + + %v = zext i8 %v.narrow to i16 %v0 = add i16 %v, 0 %v1 = add i16 %v, 1 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll index a8e2308e6e37f..4235d8f13db33 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll @@ -1,10 +1,10 @@ -; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -@A = global [1024 x i16] zeroinitializer, align 128 +@A 
= global [1024 x i8] zeroinitializer, align 128 @B = global [1024 x i16] zeroinitializer, align 128 ; CHECK: LV: Checking a loop in "test" @@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2 ; CHECK: LV: Found an estimated cost of 66 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2 +; CHECK: LV: Found an estimated cost of 342 for VF 32 For instruction: store i16 %v2, i16* %out2, align 2 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %v2, i16* %out2, align 2 define void @test() { entry: @@ -25,8 +27,10 @@ for.body: %iv.1 = add nuw nsw i64 %iv, 1 %iv.2 = add nuw nsw i64 %iv, 2 - %in = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0 - %v = load i16, i16* %in + %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0 + %v.narrow = load i8, i8* %in + + %v = zext i8 %v.narrow to i16 %v0 = add i16 %v, 0 %v1 = add i16 %v, 1 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll index 9e03bde324f99..fa04acf7b820c 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll @@ -1,10 +1,10 @@ -; RUN: opt -loop-vectorize -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -@A = global [1024 x i16] zeroinitializer, align 128 +@A = global [1024 x i8] zeroinitializer, align 128 @B = global [1024 x i16] 
zeroinitializer, align 128 ; CHECK: LV: Checking a loop in "test" @@ -13,6 +13,8 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: LV: Found an estimated cost of 49 for VF 4 For instruction: store i16 %v3, i16* %out3, align 2 ; CHECK: LV: Found an estimated cost of 98 for VF 8 For instruction: store i16 %v3, i16* %out3, align 2 ; CHECK: LV: Found an estimated cost of 228 for VF 16 For instruction: store i16 %v3, i16* %out3, align 2 +; CHECK: LV: Found an estimated cost of 456 for VF 32 For instruction: store i16 %v3, i16* %out3, align 2 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %v3, i16* %out3, align 2 define void @test() { entry: @@ -26,8 +28,10 @@ for.body: %iv.2 = add nuw nsw i64 %iv, 2 %iv.3 = add nuw nsw i64 %iv, 3 - %in = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0 - %v = load i16, i16* %in + %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0 + %v.narrow = load i8, i8* %in + + %v = zext i8 %v.narrow to i16 %v0 = add i16 %v, 0 %v1 = add i16 %v, 1 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll new file mode 100644 index 0000000000000..e0899f2646ee1 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll @@ -0,0 +1,60 @@ +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [1024 x i8] zeroinitializer, align 128 +@B = global [1024 x i16] zeroinitializer, align 128 + +; CHECK: LV: Checking a loop in "test" +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, i16* %out4, align 2 +; CHECK: LV: Found an estimated cost of 28 for VF 2 For instruction: store i16 %v4, i16* %out4, 
align 2 +; CHECK: LV: Found an estimated cost of 58 for VF 4 For instruction: store i16 %v4, i16* %out4, align 2 +; CHECK: LV: Found an estimated cost of 115 for VF 8 For instruction: store i16 %v4, i16* %out4, align 2 +; CHECK: LV: Found an estimated cost of 285 for VF 16 For instruction: store i16 %v4, i16* %out4, align 2 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %v4, i16* %out4, align 2 + +define void @test() { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + + %iv.0 = add nuw nsw i64 %iv, 0 + %iv.1 = add nuw nsw i64 %iv, 1 + %iv.2 = add nuw nsw i64 %iv, 2 + %iv.3 = add nuw nsw i64 %iv, 3 + %iv.4 = add nuw nsw i64 %iv, 4 + + %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0 + %v.narrow = load i8, i8* %in + + %v = zext i8 %v.narrow to i16 + + %v0 = add i16 %v, 0 + %v1 = add i16 %v, 1 + %v2 = add i16 %v, 2 + %v3 = add i16 %v, 3 + %v4 = add i16 %v, 4 + + %out0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0 + %out1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.1 + %out2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.2 + %out3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.3 + %out4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.4 + + store i16 %v0, i16* %out0 + store i16 %v1, i16* %out1 + store i16 %v2, i16* %out2 + store i16 %v3, i16* %out3 + store i16 %v4, i16* %out4 + + %iv.next = add nuw nsw i64 %iv.0, 5 + %cmp = icmp ult i64 %iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll new file mode 100644 index 0000000000000..2ee01338400b8 --- /dev/null +++ 
b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll @@ -0,0 +1,64 @@ +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [1024 x i8] zeroinitializer, align 128 +@B = global [1024 x i16] zeroinitializer, align 128 + +; CHECK: LV: Checking a loop in "test" +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, i16* %out5, align 2 +; CHECK: LV: Found an estimated cost of 35 for VF 2 For instruction: store i16 %v5, i16* %out5, align 2 +; CHECK: LV: Found an estimated cost of 66 for VF 4 For instruction: store i16 %v5, i16* %out5, align 2 +; CHECK: LV: Found an estimated cost of 147 for VF 8 For instruction: store i16 %v5, i16* %out5, align 2 +; CHECK: LV: Found an estimated cost of 342 for VF 16 For instruction: store i16 %v5, i16* %out5, align 2 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %v5, i16* %out5, align 2 + +define void @test() { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + + %iv.0 = add nuw nsw i64 %iv, 0 + %iv.1 = add nuw nsw i64 %iv, 1 + %iv.2 = add nuw nsw i64 %iv, 2 + %iv.3 = add nuw nsw i64 %iv, 3 + %iv.4 = add nuw nsw i64 %iv, 4 + %iv.5 = add nuw nsw i64 %iv, 5 + + %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0 + %v.narrow = load i8, i8* %in + + %v = zext i8 %v.narrow to i16 + + %v0 = add i16 %v, 0 + %v1 = add i16 %v, 1 + %v2 = add i16 %v, 2 + %v3 = add i16 %v, 3 + %v4 = add i16 %v, 4 + %v5 = add i16 %v, 5 + + %out0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0 + %out1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.1 + %out2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.2 + 
%out3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.3 + %out4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.4 + %out5 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.5 + + store i16 %v0, i16* %out0 + store i16 %v1, i16* %out1 + store i16 %v2, i16* %out2 + store i16 %v3, i16* %out3 + store i16 %v4, i16* %out4 + store i16 %v5, i16* %out5 + + %iv.next = add nuw nsw i64 %iv.0, 6 + %cmp = icmp ult i64 %iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +}