Skip to content

Commit

Permalink
[LoopVectorize] Take vscale into account when deciding to create epilogues
Browse files Browse the repository at this point in the history

In LoopVectorizationCostModel::isEpilogueVectorizationProfitable we
check whether the chosen main vector loop VF is >= 16. If so, we
decide to create a vector epilogue loop. However, this check doesn't
take VScaleForTuning into account: when targeting a CPU where
vscale > 1, the runtime VF is a multiple of the known minimum value,
so a scalable VF can be larger than its known minimum suggests.

This patch multiplies scalable VFs by VScaleForTuning before comparing
them against the threshold. Several tests that now produce vector
epilogues have been updated.

Differential Revision: https://reviews.llvm.org/D147522
  • Loading branch information
david-arm committed Apr 17, 2023
1 parent f2d03a2 commit 69ee653
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 12 deletions.
8 changes: 5 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5619,9 +5619,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// consider interleaving beneficial (eg. MVE).
if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;
// FIXME: We should consider changing the threshold for scalable
// vectors to take VScaleForTuning into account.
if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)

unsigned Multiplier = 1;
if (VF.isScalable())
Multiplier = getVScaleForTuning().value_or(1);
if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
return true;
return false;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 \
; RUN: -enable-epilogue-vectorization=false -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 \
; RUN: -enable-epilogue-vectorization=false -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s

; Tests for loops with large numbers of runtime checks. Check that loops are
; vectorized, if the loop trip counts are large and the impact of the runtime
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
; RUN: -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
; RUN: -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
; RUN: -mcpu=cortex-x2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG

target triple = "aarch64-unknown-linux-gnu"

; Test input: a scalar loop that, for each index starting at 0 and stopping
; once the incremented index equals %len, loads an i16 from %p, adds 2 to it
; and stores the result to the same index of %q.
define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i64 %len) #0 {
; Runs that use the epilogue prefix expect the vector epilogue preheader and
; body labels to appear, followed by a load of <vscale x 4 x i16>.
; CHECK-EPILOG: vec.epilog.ph:
; CHECK-EPILOG: vec.epilog.vector.body:
; CHECK-EPILOG: load <vscale x 4 x i16>

; Runs that use the no-epilogue prefix require the labels below to be absent.
; NOTE(review): the first pattern names vec.epilog.vector.ph, whereas the
; positive check above matches vec.epilog.ph -- confirm the intended label.
; CHECK-NO-EPILOG-NOT: vec.epilog.vector.ph:
; CHECK-NO-EPILOG-NOT: vec.epilog.vector.body:
entry:
br label %for.body

; Single-block loop; %indvars.iv is the induction variable (0 on entry, +1 per
; iteration).
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
; Read p[iv] as an i16 and add the constant 2.
%arrayidx = getelementptr inbounds i16, ptr %p, i64 %indvars.iv
%0 = load i16, ptr %arrayidx
%add = add nuw nsw i16 %0, 2
; Write the sum to q[iv].
%arrayidx3 = getelementptr inbounds i16, ptr %q, i64 %indvars.iv
store i16 %add, ptr %arrayidx3
; Advance the index and leave the loop once it equals %len.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %len
br i1 %exitcond, label %exit, label %for.body

exit: ; preds = %for.body
ret void
}

attributes #0 = { "target-features"="+sve" vscale_range(1,16) }
9 changes: 5 additions & 4 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize,dce -mtriple aarch64-linux-gnu -mattr=+sve \
; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s -S | FileCheck %s
; RUN: opt -passes=loop-vectorize,dce -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -enable-epilogue-vectorization=false < %s -S | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"

; This should be vscale x 8 vectorized, maybe with some interleaving.

define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef readonly %s, i32 noundef %n) {
define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef readonly %s, i32 noundef %n) #0 {
; CHECK-LABEL: @fneg(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[S2:%.*]] = ptrtoint ptr [[S:%.*]] to i64
Expand Down Expand Up @@ -100,3 +99,5 @@ for.body: ; preds = %for.body.preheader,
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

attributes #0 = { "target-features"="+sve" vscale_range(1,16) }
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; REQUIRES: asserts
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
; RUN: -debug-only=loop-vectorize 2>%t < %s | FileCheck %s
; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize 2>%t < %s | FileCheck %s
; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST

target triple = "aarch64-unknown-linux-gnu"
Expand All @@ -17,7 +17,6 @@ define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur
; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %conv = zext i8 %0 to i32
; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %conv = zext i8 %0 to i32
; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %conv = zext i8 %0 to i32

; CHECK-LABEL: define void @zext_i8_i16
; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
Expand Down Expand Up @@ -101,7 +100,6 @@ define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur
; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %conv = sext i8 %0 to i32
; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %conv = sext i8 %0 to i32
; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %conv = sext i8 %0 to i32

; CHECK-LABEL: define void @sext_i8_i16
; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
Expand Down

0 comments on commit 69ee653

Please sign in to comment.