Skip to content

Commit

Permalink
[SLP]Generalize cost model.
Browse files Browse the repository at this point in the history
Generalized the cost model estimation. Improved the cost model estimation
for repeated scalars (their cost no longer needs to be counted) and
improved the cost model for extractelement instructions.

cpu2017
   511.povray_r             0.57
   520.omnetpp_r           -0.98
   521.wrf_r               -0.01
   525.x264_r               3.59 <+
   526.blender_r           -0.12
   531.deepsjeng_r         -0.07
   538.imagick_r           -1.42
Geometric mean:  0.21

Differential Revision: https://reviews.llvm.org/D115757
  • Loading branch information
alexey-bataev committed Oct 18, 2022
1 parent 743087f commit f12fb91
Show file tree
Hide file tree
Showing 7 changed files with 402 additions and 373 deletions.
639 changes: 322 additions & 317 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s
; RUN: opt -passes=slp-vectorizer -slp-threshold=-3 -S -pass-remarks-output=%t < %s | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=YAML %s


Expand Down
Expand Up @@ -107,25 +107,14 @@ entry:
define void @select_ule_ugt_mix_4xi32(i32* %ptr, i32 %x) {
; CHECK-LABEL: @select_ule_ugt_mix_4xi32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[L_0:%.*]] = load i32, i32* [[PTR:%.*]], align 4
; CHECK-NEXT: [[CMP_0:%.*]] = icmp ult i32 [[L_0]], 16383
; CHECK-NEXT: [[S_0:%.*]] = select i1 [[CMP_0]], i32 [[L_0]], i32 16383
; CHECK-NEXT: store i32 [[S_0]], i32* [[PTR]], align 4
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 1
; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[GEP_1]], align 4
; CHECK-NEXT: [[CMP_1:%.*]] = icmp ugt i32 [[L_1]], 16383
; CHECK-NEXT: [[S_1:%.*]] = select i1 [[CMP_1]], i32 [[L_1]], i32 16383
; CHECK-NEXT: store i32 [[S_1]], i32* [[GEP_1]], align 4
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 2
; CHECK-NEXT: [[L_2:%.*]] = load i32, i32* [[GEP_2]], align 4
; CHECK-NEXT: [[CMP_2:%.*]] = icmp ult i32 [[L_2]], 16383
; CHECK-NEXT: [[S_2:%.*]] = select i1 [[CMP_2]], i32 [[L_2]], i32 16383
; CHECK-NEXT: store i32 [[S_2]], i32* [[GEP_2]], align 4
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 3
; CHECK-NEXT: [[L_3:%.*]] = load i32, i32* [[GEP_3]], align 4
; CHECK-NEXT: [[CMP_3:%.*]] = icmp ugt i32 [[L_3]], 16383
; CHECK-NEXT: [[S_3:%.*]] = select i1 [[CMP_3]], i32 [[L_3]], i32 16383
; CHECK-NEXT: store i32 [[S_3]], i32* [[GEP_3]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[PTR:%.*]] to <4 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[TMP1]], <i32 16383, i32 16383, i32 16383, i32 16383>
; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i32> [[TMP1]], <i32 16383, i32 16383, i32 16383, i32 16383>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> <i32 16383, i32 16383, i32 16383, i32 16383>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT: ret void
;
entry:
Expand Down
18 changes: 11 additions & 7 deletions llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
Expand Up @@ -53,13 +53,17 @@ define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) {

define void @i64_simplifiedi_extract(i64* noalias %st, i64* noalias %ld) {
; CHECK-LABEL: @i64_simplifiedi_extract(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD:%.*]] to <2 x i64>*
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST:%.*]] to <4 x i64>*
; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3
; CHECK-NEXT: store i64 [[TMP4]], i64* [[LD]], align 8
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
; CHECK-NEXT: [[T0:%.*]] = load i64, i64* [[LD]], align 8
; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
; CHECK-NEXT: store i64 [[T0]], i64* [[ST]], align 8
; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX3]], align 8
; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX4]], align 8
; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX5]], align 8
; CHECK-NEXT: store i64 [[T1]], i64* [[LD]], align 8
; CHECK-NEXT: ret void
;
%arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
Expand Down
23 changes: 16 additions & 7 deletions llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
Expand Up @@ -8,13 +8,22 @@ define i64 @test() {
; CHECK: bb2:
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ poison, [[BB2:%.*]] ], [ zeroinitializer, [[BB1:%.*]] ]
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[SHUFFLE]])
; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[OP_RDX]] to i64
; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ 0, [[BB2:%.*]] ], [ 0, [[BB1:%.*]] ]
; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ 0, [[BB2]] ], [ 0, [[BB1]] ]
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP4]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP4]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP4]], i32 3
; CHECK-NEXT: [[TMP44:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP4]], i32 4
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP44]], i32 [[TMP4]], i32 5
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 6
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP4]], i32 7
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP7]])
; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP8]], [[TMP4]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[TMP4]], [[TMP4]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[OP_RDX]], [[OP_RDX1]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[OP_RDX2]], [[TMP]]
; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[OP_RDX3]] to i64
; CHECK-NEXT: ret i64 [[TMP65]]
;
bb1:
Expand Down
26 changes: 18 additions & 8 deletions llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
Expand Up @@ -6,14 +6,24 @@
define i16 @D134605() {
; CHECK-LABEL: @D134605(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr poison, align 1
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <8 x i32> <i32 2, i32 2, i32 1, i32 1, i32 0, i32 0, i32 3, i32 3>
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[SHUFFLE]], i32 6
; CHECK-NEXT: [[REASS_ADD:%.*]] = add i16 poison, [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[SHUFFLE]])
; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP2]], poison
; CHECK-NEXT: [[OP_RDX1:%.*]] = add i16 [[OP_RDX]], poison
; CHECK-NEXT: [[REASS_MUL24:%.*]] = shl i16 [[OP_RDX1]], 2
; CHECK-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 3
; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX81]], align 1
; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr poison, align 1
; CHECK-NEXT: [[ARRAYIDX101:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 1
; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX101]], align 1
; CHECK-NEXT: [[ARRAYIDX107:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 2
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX107]], align 1
; CHECK-NEXT: [[REASS_ADD:%.*]] = add i16 poison, [[TMP0]]
; CHECK-NEXT: [[ADD116:%.*]] = add i16 [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[ADD122:%.*]] = add i16 [[ADD116]], [[TMP2]]
; CHECK-NEXT: [[ADD124:%.*]] = add i16 [[ADD122]], [[TMP3]]
; CHECK-NEXT: [[ADD125:%.*]] = add i16 [[ADD124]], poison
; CHECK-NEXT: [[FACTOR2531:%.*]] = add i16 [[TMP3]], [[ADD125]]
; CHECK-NEXT: [[ADD14332:%.*]] = add i16 [[FACTOR2531]], [[TMP2]]
; CHECK-NEXT: [[ADD14933:%.*]] = add i16 [[ADD14332]], [[TMP1]]
; CHECK-NEXT: [[ADD15534:%.*]] = add i16 [[ADD14933]], [[TMP0]]
; CHECK-NEXT: [[ADD15935:%.*]] = add i16 [[ADD15534]], poison
; CHECK-NEXT: [[REASS_MUL24:%.*]] = shl i16 [[ADD15935]], 2
; CHECK-NEXT: [[CALL:%.*]] = call i16 @check_i16(i16 noundef 1, i16 noundef [[REASS_MUL24]], i16 noundef 5120)
; CHECK-NEXT: unreachable
;
Expand Down
40 changes: 26 additions & 14 deletions llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
Expand Up @@ -4,20 +4,32 @@
define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <2 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
; CHECK-NEXT: ret i32 [[TMP11]]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[A1:%.*]]
; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP0]], [[A2:%.*]]
; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[TMP0]], [[A3:%.*]]
; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[TMP0]], [[A4:%.*]]
; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[TMP0]], [[A5:%.*]]
; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[TMP0]], [[A6:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARR]], align 4
; CHECK-NEXT: [[ADD12:%.*]] = add i32 [[TMP1]], [[A7:%.*]]
; CHECK-NEXT: [[ADD14:%.*]] = add i32 [[TMP1]], [[A8:%.*]]
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]]
; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]]
; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]]
; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]]
; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]]
; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]]
; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]]
; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]]
; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]]
; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]]
; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]]
; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]]
; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]]
; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]]
; CHECK-NEXT: ret i32 [[COND44]]
;
entry:
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 1
Expand Down

0 comments on commit f12fb91

Please sign in to comment.