Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
; %var4 a lower scalarization overhead.
;
; COST-LABEL: predicated_udiv_scalarized_operand
; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
; COST: LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
;
;
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ target triple = "aarch64--linux-gnu"
%pair = type { i8, i8 }

; CHECK-LABEL: test
; CHECK: Found an estimated cost of 14 for VF 2 For instruction: {{.*}} load i8
; CHECK: Found an estimated cost of 16 for VF 2 For instruction: {{.*}} load i8
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
; CHECK-LABEL: entry:
; CHECK-LABEL: vector.body:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,10 @@ entry:
; gaps.
;
; VF_2-LABEL: Checking a loop in 'i64_factor_8'
; VF_2: Found an estimated cost of 10 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
; VF_2: Found an estimated cost of 16 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8
; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 2
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
target triple = "aarch64-unknown-linux-gnu"

; CHECK-COST: Checking a loop in 'fixed_width'
; CHECK-COST: Found an estimated cost of 11 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4
; CHECK-COST: Found an estimated cost of 25 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4
; CHECK-COST: Found an estimated cost of 12 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4
; CHECK-COST: Found an estimated cost of 24 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4
; CHECK-COST: Selecting VF: 1.

; We should decide this loop is not worth vectorising using fixed width vectors
Expand Down
34 changes: 17 additions & 17 deletions llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ target triple = "aarch64--linux-gnu"
; as:
;
; Cost of udiv:
; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
;
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
;
define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) {
entry:
Expand Down Expand Up @@ -57,10 +57,10 @@ for.end:
; as:
;
; Cost of store:
; (store(4) + extractelement(3)) / 2 = 3
; (store(4) + extractelement(4)) / 2 = 4
;
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
;
define void @predicated_store(ptr %a, i1 %c, i32 %x, i64 %n) {
entry:
Expand Down Expand Up @@ -94,7 +94,7 @@ for.end:
; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, ptr %addr, i64 1
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %addr, align 4
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4
; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4
;
define void @predicated_store_phi(ptr %a, i1 %c, i32 %x, i64 %n) {
entry:
Expand Down Expand Up @@ -129,14 +129,14 @@ for.end:
; compute the cost as:
;
; Cost of add:
; (add(2) + extractelement(3)) / 2 = 2
; (add(2) + extractelement(4)) / 2 = 3
; Cost of udiv:
; (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
; (udiv(2) + extractelement(4) + insertelement(4)) / 2 = 5
;
; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
;
define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
entry:
Expand Down Expand Up @@ -174,13 +174,13 @@ for.end:
; compute the cost as:
;
; Cost of add:
; (add(2) + extractelement(3)) / 2 = 2
; (add(2) + extractelement(4)) / 2 = 3
; Cost of store:
; store(4) / 2 = 2
;
; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
;
define void @predicated_store_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
Expand Down Expand Up @@ -219,11 +219,11 @@ for.end:
; Cost of add:
; add(1) = 1
; Cost of sdiv:
; (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
; (sdiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
; Cost of udiv:
; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
; Cost of sub:
; (sub(2) + extractelement(3)) / 2 = 2
; (sub(2) + extractelement(4)) / 2 = 3
; Cost of store:
; store(4) / 2 = 2
;
Expand All @@ -233,9 +233,9 @@ for.end:
; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x
; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, ptr %tmp0, align 4
;
define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) {
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

target triple="aarch64-unknown-linux-gnu"

; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction: %add = fadd float %0, %sum.07
; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction: %add = fadd float %0, %sum.07
; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction: %add = fadd float %0, %sum.07
; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction: %add = fadd float %0, %sum.07

define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) {
entry:
Expand All @@ -28,8 +28,8 @@ for.end:
}


; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction: %add = fadd double %0, %sum.07
; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction: %add = fadd double %0, %sum.07
; CHECK-VF4: Found an estimated cost of 12 for VF 4 For instruction: %add = fadd double %0, %sum.07
; CHECK-VF8: Found an estimated cost of 24 for VF 8 For instruction: %add = fadd double %0, %sum.07

define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) {
entry:
Expand All @@ -49,8 +49,8 @@ for.end:
ret double %add
}

; CHECK-VF4: Found an estimated cost of 19 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
; CHECK-VF8: Found an estimated cost of 38 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
; CHECK-VF4: Found an estimated cost of 16 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
; CHECK-VF8: Found an estimated cost of 32 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)

define float @fmuladd_strict32(ptr %a, ptr %b, i64 %n) {
entry:
Expand All @@ -74,8 +74,8 @@ for.end:

declare float @llvm.fmuladd.f32(float, float, float)

; CHECK-VF4: Found an estimated cost of 18 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
; CHECK-VF8: Found an estimated cost of 36 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
; CHECK-VF4: Found an estimated cost of 16 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
; CHECK-VF8: Found an estimated cost of 32 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)

define double @fmuladd_strict64(ptr %a, ptr %b, i64 %n) {
entry:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s

; Specify a large unsafe vectorization factor of 32 that gets clamped to 16,
; then test an even smaller VF of 2 is selected based on the cost-model.
; Specify a large unsafe vectorization factor of 32 that gets clamped to 16.

; CHECK: LV: User VF=32 is unsafe, clamping to max safe VF=16.
; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 32 is unsafe, clamping to maximum safe vectorization factor 16
; CHECK: LV: Selecting VF: 2.
; CHECK: LV: Selecting VF: 16.
; CHECK-LABEL: @test
; CHECK: <2 x i64>
; CHECK: <16 x i64>
define void @test(ptr nocapture %a, ptr nocapture readonly %b) {
entry:
br label %loop.header
Expand Down
95 changes: 10 additions & 85 deletions llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,46 +5,10 @@
define <1 x float> @dotproduct_float_v6(<6 x float> %a, <6 x float> %b) {
; CHECK-LABEL: @dotproduct_float_v6(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x float> [[A:%.*]], <6 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 2>
; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 3>
; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 4>
; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> <i32 5>
; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <6 x float> [[B:%.*]], <6 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <1 x float> [[SPLIT]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP0]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <1 x float> [[SPLIT1]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x float> poison, float [[TMP2]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT8]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK7]], <1 x float> [[SPLAT_SPLAT9]], <1 x float> [[TMP1]])
; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <1 x float> [[SPLIT2]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 2
; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x float> poison, float [[TMP4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT11]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP3]])
; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <1 x float> [[SPLIT3]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 3
; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT14]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK13]], <1 x float> [[SPLAT_SPLAT15]], <1 x float> [[TMP5]])
; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <1 x float> [[SPLIT4]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 4
; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x float> poison, float [[TMP8]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT17]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP7]])
; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <1 x float> [[SPLIT5]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 5
; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x float> poison, float [[TMP10]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT20]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK19]], <1 x float> [[SPLAT_SPLAT21]], <1 x float> [[TMP9]])
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x float> [[TMP11]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <1 x float> poison, <1 x float> [[TMP12]], <1 x i32> <i32 1>
; CHECK-NEXT: ret <1 x float> [[TMP13]]
; CHECK-NEXT: [[TMP0:%.*]] = fmul <6 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v6f32(float 0.000000e+00, <6 x float> [[TMP0]])
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
; CHECK-NEXT: ret <1 x float> [[TMP2]]
;
entry:
%c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 1, i32 6, i32 1)
Expand Down Expand Up @@ -175,51 +139,12 @@ declare <1 x double> @llvm.matrix.multiply.v1f64.v6f64.v6f64(<6 x double>, <6 x
define <1 x double> @intrinsic_column_major_load_dot_product_double_v6(ptr %lhs_address, ptr %rhs_address) {
; CHECK-LABEL: @intrinsic_column_major_load_dot_product_double_v6(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, ptr [[LHS_ADDRESS:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 1
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <1 x double>, ptr [[VEC_GEP]], align 4
; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 2
; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, ptr [[VEC_GEP2]], align 4
; CHECK-NEXT: [[VEC_GEP4:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 3
; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <1 x double>, ptr [[VEC_GEP4]], align 4
; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 4
; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <1 x double>, ptr [[VEC_GEP6]], align 4
; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 5
; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, ptr [[VEC_GEP8]], align 4
; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <6 x double>, ptr [[RHS_ADDRESS:%.*]], align 4
; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <1 x double> [[COL_LOAD]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
; CHECK-NEXT: [[BLOCK11:%.*]] = shufflevector <1 x double> [[COL_LOAD1]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT12]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK11]], <1 x double> [[SPLAT_SPLAT13]], <1 x double> [[TMP1]])
; CHECK-NEXT: [[BLOCK14:%.*]] = shufflevector <1 x double> [[COL_LOAD3]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 2
; CHECK-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT15]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK14]], <1 x double> [[SPLAT_SPLAT16]], <1 x double> [[TMP3]])
; CHECK-NEXT: [[BLOCK17:%.*]] = shufflevector <1 x double> [[COL_LOAD5]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 3
; CHECK-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x double> poison, double [[TMP6]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT18]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK17]], <1 x double> [[SPLAT_SPLAT19]], <1 x double> [[TMP5]])
; CHECK-NEXT: [[BLOCK20:%.*]] = shufflevector <1 x double> [[COL_LOAD7]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 4
; CHECK-NEXT: [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT21]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK20]], <1 x double> [[SPLAT_SPLAT22]], <1 x double> [[TMP7]])
; CHECK-NEXT: [[BLOCK23:%.*]] = shufflevector <1 x double> [[COL_LOAD9]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 5
; CHECK-NEXT: [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT24]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK23]], <1 x double> [[SPLAT_SPLAT25]], <1 x double> [[TMP9]])
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <1 x double> poison, <1 x double> [[TMP12]], <1 x i32> <i32 1>
; CHECK-NEXT: ret <1 x double> [[TMP13]]
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <6 x double>, ptr [[RHS_ADDRESS:%.*]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = load <6 x double>, ptr [[LHS_ADDRESS:%.*]], align 64
; CHECK-NEXT: [[TMP1:%.*]] = fmul <6 x double> [[TMP0]], [[COL_LOAD]]
; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v6f64(double 0.000000e+00, <6 x double> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
; CHECK-NEXT: ret <1 x double> [[TMP3]]
;
entry:
%lhs = tail call fast <6 x double> @llvm.matrix.column.major.load.v6f64.i64(ptr nonnull align 4 %lhs_address, i64 1, i1 false, i32 1, i32 6)
Expand Down
47 changes: 26 additions & 21 deletions llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,21 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
; CHECK-NEXT: [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64>
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]]
; CHECK-NEXT: [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[S3]]
; CHECK-NEXT: [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4
; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
; CHECK-NEXT: ret void
Expand Down Expand Up @@ -58,23 +61,25 @@ define void @test2(<4 x i16> %a, <4 x i16> %b, i64 %c0, i64 %c1, i64 %c2, i64 %c
; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[C0:%.*]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP6]]
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64
; CHECK-NEXT: [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]]
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[A0]]
; CHECK-NEXT: [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP7]]
; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64
; CHECK-NEXT: [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A1]]
; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP8]]
; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64
; CHECK-NEXT: [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]]
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A2]]
; CHECK-NEXT: [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]
; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64
; CHECK-NEXT: [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]]
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A3]]
; CHECK-NEXT: [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4
; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
; CHECK-NEXT: ret void
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu"
; REMARK-LABEL: Function: gather_multiple_use
; REMARK: Args:
; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost '
; REMARK-NEXT: - Cost: '-7'
; REMARK-NEXT: - Cost: '-8'
;
; REMARK-NOT: Function: gather_load

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ target triple = "aarch64--linux-gnu"
; YAML-NEXT: Function: getelementptr_4x32
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '6'
; YAML-NEXT: - Cost: '4'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '3'

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
; YAML-NEXT: Function: foo
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '3'
; YAML-NEXT: - Cost: '2'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '2'

Expand All @@ -28,7 +28,7 @@
; YAML-NEXT: Function: foo
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '1'
; YAML-NEXT: - Cost: '2'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '9'

Expand Down
80 changes: 30 additions & 50 deletions llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,66 +11,46 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: @wrap_mul4(
; CHECK-NEXT: [[TEMP:%.*]] = load double, ptr [[A:%.*]], align 8
; CHECK-NEXT: [[TEMP1:%.*]] = load double, ptr [[B:%.*]], align 8
; CHECK-NEXT: [[MUL_I:%.*]] = fmul double [[TEMP]], [[TEMP1]]
; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 0, i64 1
; CHECK-NEXT: [[TEMP2:%.*]] = load double, ptr [[ARRAYIDX5_I]], align 8
; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 0
; CHECK-NEXT: [[TEMP3:%.*]] = load double, ptr [[ARRAYIDX7_I]], align 8
; CHECK-NEXT: [[MUL8_I:%.*]] = fmul double [[TEMP2]], [[TEMP3]]
; CHECK-NEXT: [[ADD_I:%.*]] = fadd double [[MUL_I]], [[MUL8_I]]
; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 1
; CHECK-NEXT: [[TEMP4:%.*]] = load double, ptr [[ARRAYIDX13_I]], align 8
; CHECK-NEXT: [[MUL14_I:%.*]] = fmul double [[TEMP]], [[TEMP4]]
; CHECK-NEXT: [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 1
; CHECK-NEXT: [[TEMP5:%.*]] = load double, ptr [[ARRAYIDX18_I]], align 8
; CHECK-NEXT: [[MUL19_I:%.*]] = fmul double [[TEMP2]], [[TEMP5]]
; CHECK-NEXT: [[ADD20_I:%.*]] = fadd double [[MUL14_I]], [[MUL19_I]]
; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B:%.*]], i64 1, i64 0
; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 2
; CHECK-NEXT: [[TEMP6:%.*]] = load double, ptr [[ARRAYIDX25_I]], align 8
; CHECK-NEXT: [[MUL26_I:%.*]] = fmul double [[TEMP]], [[TEMP6]]
; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 2
; CHECK-NEXT: [[TEMP7:%.*]] = load double, ptr [[ARRAYIDX30_I]], align 8
; CHECK-NEXT: [[MUL31_I:%.*]] = fmul double [[TEMP2]], [[TEMP7]]
; CHECK-NEXT: [[ADD32_I:%.*]] = fadd double [[MUL26_I]], [[MUL31_I]]
; CHECK-NEXT: [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 3
; CHECK-NEXT: [[TEMP8:%.*]] = load double, ptr [[ARRAYIDX37_I]], align 8
; CHECK-NEXT: [[MUL38_I:%.*]] = fmul double [[TEMP]], [[TEMP8]]
; CHECK-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 3
; CHECK-NEXT: [[TEMP9:%.*]] = load double, ptr [[ARRAYIDX42_I]], align 8
; CHECK-NEXT: [[MUL43_I:%.*]] = fmul double [[TEMP2]], [[TEMP9]]
; CHECK-NEXT: [[ADD44_I:%.*]] = fadd double [[MUL38_I]], [[MUL43_I]]
; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 0
; CHECK-NEXT: [[TEMP10:%.*]] = load double, ptr [[ARRAYIDX47_I]], align 8
; CHECK-NEXT: [[MUL50_I:%.*]] = fmul double [[TEMP1]], [[TEMP10]]
; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 1
; CHECK-NEXT: [[TEMP11:%.*]] = load double, ptr [[ARRAYIDX52_I]], align 8
; CHECK-NEXT: [[MUL55_I:%.*]] = fmul double [[TEMP3]], [[TEMP11]]
; CHECK-NEXT: [[ADD56_I:%.*]] = fadd double [[MUL50_I]], [[MUL55_I]]
; CHECK-NEXT: [[MUL62_I:%.*]] = fmul double [[TEMP4]], [[TEMP10]]
; CHECK-NEXT: [[MUL67_I:%.*]] = fmul double [[TEMP5]], [[TEMP11]]
; CHECK-NEXT: [[ADD68_I:%.*]] = fadd double [[MUL62_I]], [[MUL67_I]]
; CHECK-NEXT: [[MUL74_I:%.*]] = fmul double [[TEMP6]], [[TEMP10]]
; CHECK-NEXT: [[MUL79_I:%.*]] = fmul double [[TEMP7]], [[TEMP11]]
; CHECK-NEXT: [[ADD80_I:%.*]] = fadd double [[MUL74_I]], [[MUL79_I]]
; CHECK-NEXT: [[MUL86_I:%.*]] = fmul double [[TEMP8]], [[TEMP10]]
; CHECK-NEXT: [[MUL91_I:%.*]] = fmul double [[TEMP9]], [[TEMP11]]
; CHECK-NEXT: [[ADD92_I:%.*]] = fadd double [[MUL86_I]], [[MUL91_I]]
; CHECK-NEXT: store double [[ADD_I]], ptr [[OUT:%.*]], align 8
; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 1
; CHECK-NEXT: store double [[ADD20_I]], ptr [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2]], align 8
; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 2
; CHECK-NEXT: store double [[ADD32_I]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
; CHECK-NEXT: [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 3
; CHECK-NEXT: store double [[ADD44_I]], ptr [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX7_I]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 2
; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX25_I]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x double> [[TMP3]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[ARRAYIDX30_I]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x double> [[TMP7]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = fadd <2 x double> [[TMP11]], [[TMP13]]
; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[OUT]], align 8
; CHECK-NEXT: store <2 x double> [[TMP14]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 4
; CHECK-NEXT: store double [[ADD56_I]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
; CHECK-NEXT: [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 5
; CHECK-NEXT: store double [[ADD68_I]], ptr [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10]], align 8
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP1]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[TMP18]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP5]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP17]], [[TMP20]]
; CHECK-NEXT: store <2 x double> [[TMP21]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 6
; CHECK-NEXT: store double [[ADD80_I]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8
; CHECK-NEXT: [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 7
; CHECK-NEXT: store double [[ADD92_I]], ptr [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14]], align 8
; CHECK-NEXT: [[TMP22:%.*]] = fmul <2 x double> [[TMP10]], [[TMP16]]
; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[TMP12]], [[TMP19]]
; CHECK-NEXT: [[TMP24:%.*]] = fadd <2 x double> [[TMP22]], [[TMP23]]
; CHECK-NEXT: store <2 x double> [[TMP24]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8
; CHECK-NEXT: ret void
;
%temp = load double, ptr %A, align 8
Expand Down
33 changes: 15 additions & 18 deletions llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
Original file line number Diff line number Diff line change
Expand Up @@ -654,18 +654,17 @@ bb23:
define void @single_membound(ptr %arg, ptr %arg1, double %x) {
; CHECK-LABEL: @single_membound(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
; CHECK-NEXT: [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01
; CHECK-NEXT: store double [[TMP]], ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr [[ARG1:%.*]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = fsub double 1.000000e+00, [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 2
; CHECK-NEXT: br label [[BB15:%.*]]
; CHECK: bb15:
; CHECK-NEXT: [[TMP16:%.*]] = fmul double [[TMP]], 2.000000e+01
; CHECK-NEXT: store double [[TMP16]], ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP17:%.*]] = fmul double [[TMP13]], 3.000000e+01
; CHECK-NEXT: store double [[TMP17]], ptr [[TMP14]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP13]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 2.000000e+01, double 3.000000e+01>
; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP9]], align 8
; CHECK-NEXT: ret void
;
entry:
Expand Down Expand Up @@ -1232,27 +1231,25 @@ define void @crash_no_tracked_instructions(ptr %arg, ptr %arg.2, ptr %arg.3, i1
; CHECK-NEXT: entry:
; CHECK-NEXT: [[T19:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
; CHECK-NEXT: [[T20:%.*]] = load float, ptr [[ARG_3:%.*]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[T20]], i32 1
; CHECK-NEXT: br i1 [[C:%.*]], label [[BB22:%.*]], label [[BB30:%.*]]
; CHECK: bb22:
; CHECK-NEXT: [[T23:%.*]] = fmul float [[T20]], 9.900000e+01
; CHECK-NEXT: [[T24:%.*]] = fmul float [[T23]], 9.900000e+01
; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, ptr [[T19]], i64 2
; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01
; CHECK-NEXT: store float [[T26]], ptr [[T25]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 9.900000e+01, float 1.000000e+01>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; CHECK-NEXT: store float [[TMP4]], ptr [[T25]], align 4
; CHECK-NEXT: [[T27:%.*]] = load float, ptr [[ARG_2:%.*]], align 8
; CHECK-NEXT: [[T28:%.*]] = fadd float [[T24]], 2.000000e+01
; CHECK-NEXT: [[T29:%.*]] = fadd float [[T26]], 2.000000e+01
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
; CHECK-NEXT: br label [[BB30]]
; CHECK: bb30:
; CHECK-NEXT: [[T31:%.*]] = phi float [ [[T28]], [[BB22]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[T32:%.*]] = phi float [ [[T29]], [[BB22]] ], [ [[T20]], [[ENTRY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[BB36:%.*]]
; CHECK: bb36:
; CHECK-NEXT: [[T37:%.*]] = fmul float [[T31]], 3.000000e+00
; CHECK-NEXT: store float [[T37]], ptr [[ARG_3]], align 4
; CHECK-NEXT: [[T39:%.*]] = fmul float [[T32]], 3.000000e+00
; CHECK-NEXT: [[T40:%.*]] = getelementptr inbounds float, ptr [[ARG_3]], i64 1
; CHECK-NEXT: store float [[T39]], ptr [[T40]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], <float 3.000000e+00, float 3.000000e+00>
; CHECK-NEXT: store <2 x float> [[TMP7]], ptr [[ARG_3]], align 4
; CHECK-NEXT: br label [[BB41:%.*]]
; CHECK: bb41:
; CHECK-NEXT: ret void
Expand Down
639 changes: 165 additions & 474 deletions llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
Original file line number Diff line number Diff line change
Expand Up @@ -206,22 +206,22 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-LABEL: @slp_not_profitable_in_loop(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_A_1]], align 4
; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2
; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float 3.000000e+00, [[L_0]]
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
; CHECK-NEXT: [[MUL14:%.*]] = fmul fast float [[X:%.*]], [[L_2]]
; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]]
; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[MUL14]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -505,9 +505,9 @@ define i31 @load_with_non_power_of_2_element_type(ptr %x) {
define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
; CHECK-LABEL: @load_multiple_extracts_with_constant_idx(
; CHECK-NEXT: [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]]
; CHECK-NEXT: [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[E_0:%.*]] = extractelement <4 x i32> [[LV]], i32 0
; CHECK-NEXT: [[E_1:%.*]] = extractelement <4 x i32> [[LV]], i32 1
; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
; CHECK-NEXT: ret i32 [[RES]]
;
%lv = load <4 x i32>, ptr %x
Expand All @@ -521,10 +521,9 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
; because the vector large vector requires 2 vector registers.
define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) {
; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable(
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i32>, ptr [[X:%.*]], i32 0, i32 0
; CHECK-NEXT: [[E_0:%.*]] = load i32, ptr [[TMP1]], align 16
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[X]], i32 0, i32 6
; CHECK-NEXT: [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8
; CHECK-NEXT: [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16
; CHECK-NEXT: [[E_0:%.*]] = extractelement <8 x i32> [[LV]], i32 0
; CHECK-NEXT: [[E_1:%.*]] = extractelement <8 x i32> [[LV]], i32 6
; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
; CHECK-NEXT: ret i32 [[RES]]
;
Expand Down