diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll
new file mode 100644
index 000000000000..43d54d12aad4
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -mcpu=skx -S -loop-vectorize -enable-interleaved-mem-accesses -prefer-predicate-over-epilogue=predicate-dont-vectorize --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
+; RUN: opt -mcpu=skx -S -loop-vectorize -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -prefer-predicate-over-epilogue=predicate-dont-vectorize --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; (1) Interleave-group with factor 4, storing only 2 members out of the 4.
+; Check that when we allow masked-memops to support interleave-group with gaps,
+; the store is vectorized using a wide masked store, with a 1,1,0,0,1,1,0,0,... mask.
+; Check that when we don't allow masked-memops to support interleave-group with gaps,
+; the store is scalarized.
+; The input IR was generated from this source: +; for(i=0;i<1024;i++){ +; points[i*4] = x[i]; +; points[i*4 + 1] = y[i]; +; } +; (relates to the testcase in PR50566) + +; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test1" +; +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 110 for VF 8 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 110 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 228 for VF 16 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 228 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2 + +; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test1" +; +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an 
estimated cost of 9 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 40 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 96 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2 + +define void @test1(i16* noalias nocapture %points, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 2 + %1 = shl nuw nsw i64 %indvars.iv, 2 + %arrayidx2 = getelementptr inbounds i16, i16* %points, i64 %1 + store i16 %0, i16* %arrayidx2, align 2 + %arrayidx4 = getelementptr inbounds i16, i16* %y, i64 %indvars.iv + %2 = load i16, i16* %arrayidx4, align 2 + %3 = or i64 %1, 1 + %arrayidx7 = getelementptr inbounds i16, i16* %points, i64 %3 + store i16 %2, i16* %arrayidx7, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +; (2) Same as above, but this time the gaps mask of the store is also And-ed with the +; fold-tail mask. If using masked memops to vectorize interleaved-group with gaps is +; not allowed, the store is scalarized and predicated. 
+; The input IR was generated from this source:
+; for(i=0;i<1024;i++){ ... }
+; [NOTE(review): extraction garbling here — the remainder of test case (2)'s
+; source snippet, its DISABLED/ENABLED cost CHECK lines and its IR, plus the
+; opening sentence of case (3)'s description (a store guarded by the condition
+; x[i] > 0), appear to be missing; restore them from the original patch.]
+; If using masked memops to vectorize interleaved-group with gaps is
+; not allowed, the store is scalarized and predicated.
+; Here the Interleave-group is with factor 3, storing only 1 member out of the 3.
+; The input IR was generated from this source:
+; for(i=0;i<1024;i++){
+;   if (x[i] > 0)
+;     points[i*3] = x[i];
+; }
+
+; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test"
+;
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, i16* %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, i16* %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, i16* %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, i16* %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 16 For instruction: store i16 %0, i16* %arrayidx6, align 2
+
+; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test"
+;
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, i16* %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, i16* %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, i16* %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, i16* %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 16 For instruction: store i16 %0, i16* %arrayidx6, align 2
+
+define void @test(i16* noalias nocapture %points, i16* noalias nocapture readonly %x, i16* noalias nocapture readnone %y) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ 
%indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 2 + %cmp1 = icmp sgt i16 %0, 0 + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %1 = mul nuw nsw i64 %indvars.iv, 3 + %arrayidx6 = getelementptr inbounds i16, i16* %points, i64 %1 + store i16 %0, i16* %arrayidx6, align 2 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +}