247 changes: 173 additions & 74 deletions llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
Original file line number Diff line number Diff line change
@@ -1,32 +1,85 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s

define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
; CHECK-LABEL: @v3_load_i32_mul_by_constant_store(
; NON-POW2-LABEL: @v3_load_i32_mul_by_constant_store(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <3 x i32> [[TMP0]], <i32 10, i32 10, i32 10>
; NON-POW2-NEXT: store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_by_constant_store(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 10, i32 10>
; POW2-ONLY-NEXT: store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
%l.src.0 = load i32, ptr %gep.src.0, align 4
%mul.0 = mul nsw i32 %l.src.0, 10

%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
%l.src.1 = load i32, ptr %gep.src.1, align 4
%mul.1 = mul nsw i32 %l.src.1, 10

%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
%l.src.2 = load i32, ptr %gep.src.2, align 4
%mul.2 = mul nsw i32 %l.src.2, 10

store i32 %mul.0, ptr %dst

%dst.1 = getelementptr i32, ptr %dst, i32 1
store i32 %mul.1, ptr %dst.1

%dst.2 = getelementptr i32, ptr %dst, i32 2
store i32 %mul.2, ptr %dst.2

ret void
}

; Should no be vectorized with a undef/poison element as padding, as division by undef/poison may cause UB.
define void @v3_load_i32_udiv_by_constant_store(ptr %src, ptr %dst) {
; CHECK-LABEL: @v3_load_i32_udiv_by_constant_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; CHECK-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
; CHECK-NEXT: [[MUL_0:%.*]] = udiv i32 10, [[L_SRC_0]]
; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
; CHECK-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
; CHECK-NEXT: [[MUL_1:%.*]] = udiv i32 10, [[L_SRC_1]]
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 10, i32 10>
; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: [[MUL_2:%.*]] = udiv i32 10, [[L_SRC_2]]
; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DST_1]], align 4
; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
; CHECK-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
%l.src.0 = load i32, ptr %gep.src.0, align 4
%mul.0 = mul nsw i32 %l.src.0, 10
%mul.0 = udiv i32 10, %l.src.0

%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
%l.src.1 = load i32, ptr %gep.src.1, align 4
%mul.1 = mul nsw i32 %l.src.1, 10
%mul.1 = udiv i32 10, %l.src.1

%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
%l.src.2 = load i32, ptr %gep.src.2, align 4
%mul.2 = mul nsw i32 %l.src.2, 10
%mul.2 = udiv i32 10, %l.src.2

store i32 %mul.0, ptr %dst

Expand All @@ -39,23 +92,35 @@ entry:
ret void
}



define void @v3_load_i32_mul_store(ptr %src.1, ptr %src.2, ptr %dst) {
; CHECK-LABEL: @v3_load_i32_mul_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; CHECK-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; CHECK-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
; CHECK-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
; CHECK-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
; CHECK-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
; CHECK-NEXT: ret void
; NON-POW2-LABEL: @v3_load_i32_mul_store(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_store(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
Expand Down Expand Up @@ -88,24 +153,35 @@ entry:
}

define void @v3_load_i32_mul_add_const_store(ptr %src.1, ptr %src.2, ptr %dst) {
; CHECK-LABEL: @v3_load_i32_mul_add_const_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; CHECK-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; CHECK-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
; CHECK-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
; CHECK-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
; CHECK-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], <i32 9, i32 9>
; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; CHECK-NEXT: store i32 [[ADD_2]], ptr [[DST_2]], align 4
; CHECK-NEXT: ret void
; NON-POW2-LABEL: @v3_load_i32_mul_add_const_store(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
; NON-POW2-NEXT: [[TMP3:%.*]] = add <3 x i32> [[TMP2]], <i32 9, i32 9, i32 9>
; NON-POW2-NEXT: store <3 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_add_const_store(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
; POW2-ONLY-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
; POW2-ONLY-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], <i32 9, i32 9>
; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT: store i32 [[ADD_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
Expand Down Expand Up @@ -141,18 +217,26 @@ entry:
}

define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
; CHECK-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
; CHECK-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
; CHECK-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01>
; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
; CHECK-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
; CHECK-NEXT: ret void
; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01, float 1.000000e+01>
; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01>
; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
; POW2-ONLY-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
Expand All @@ -179,18 +263,28 @@ entry:
}

define void @phi_store3(ptr %dst) {
; CHECK-LABEL: @phi_store3(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: invoke.cont8.loopexit:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
; CHECK-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4
; CHECK-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4
; CHECK-NEXT: ret void
; NON-POW2-LABEL: @phi_store3(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: br label [[EXIT:%.*]]
; NON-POW2: invoke.cont8.loopexit:
; NON-POW2-NEXT: br label [[EXIT]]
; NON-POW2: exit:
; NON-POW2-NEXT: [[TMP0:%.*]] = phi <3 x i32> [ <i32 1, i32 2, i32 3>, [[ENTRY:%.*]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
; NON-POW2-NEXT: store <3 x i32> [[TMP0]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @phi_store3(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: br label [[EXIT:%.*]]
; POW2-ONLY: invoke.cont8.loopexit:
; POW2-ONLY-NEXT: br label [[EXIT]]
; POW2-ONLY: exit:
; POW2-ONLY-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
; POW2-ONLY-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
; POW2-ONLY-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4
; POW2-ONLY-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
br label %exit
Expand All @@ -213,13 +307,18 @@ exit:
}

define void @store_try_reorder(ptr %dst) {
; CHECK-LABEL: @store_try_reorder(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ADD:%.*]] = add i32 0, 0
; CHECK-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
; CHECK-NEXT: ret void
; NON-POW2-LABEL: @store_try_reorder(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @store_try_reorder(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0
; POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%add = add i32 0, 0
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s

define void @vec3_vectorize_call(ptr %Colour, float %0) {
; CHECK-LABEL: @vec3_vectorize_call(
Expand Down
207 changes: 154 additions & 53 deletions llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s

%struct.zot = type { i32, i32, i32 }

Expand Down Expand Up @@ -172,32 +173,49 @@ entry:
}

define i32 @reorder_indices_1(float %0) {
; CHECK-LABEL: define i32 @reorder_indices_1(
; CHECK-SAME: float [[TMP0:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
; CHECK-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
; CHECK-NEXT: ret i32 0
; NON-POW2-LABEL: define i32 @reorder_indices_1(
; NON-POW2-SAME: float [[TMP0:%.*]]) {
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]]
; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1
; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2
; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]]
; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]])
; NON-POW2-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer)
; NON-POW2-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer
; NON-POW2-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4
; NON-POW2-NEXT: ret i32 0
;
; POW2-ONLY-LABEL: define i32 @reorder_indices_1(
; POW2-ONLY-SAME: float [[TMP0:%.*]]) {
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
; POW2-ONLY-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
; POW2-ONLY-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
; POW2-ONLY-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
; POW2-ONLY-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
; POW2-ONLY-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
; POW2-ONLY-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
; POW2-ONLY-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
; POW2-ONLY-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
; POW2-ONLY-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
; POW2-ONLY-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
; POW2-ONLY-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
; POW2-ONLY-NEXT: ret i32 0
;
entry:
%nor1 = alloca [0 x [3 x float]], i32 0, align 4
Expand Down Expand Up @@ -228,19 +246,34 @@ entry:
}

define void @reorder_indices_2(ptr %spoint) {
; CHECK-LABEL: define void @reorder_indices_2(
; CHECK-SAME: ptr [[SPOINT:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
; CHECK-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
; CHECK-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
; CHECK-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
; CHECK-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
; CHECK-NEXT: ret void
; NON-POW2-LABEL: define void @reorder_indices_2(
; NON-POW2-SAME: ptr [[SPOINT:%.*]]) {
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1
; NON-POW2-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2
; NON-POW2-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1
; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2
; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer
; NON-POW2-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: define void @reorder_indices_2(
; POW2-ONLY-SAME: ptr [[SPOINT:%.*]]) {
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
; POW2-ONLY-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
; POW2-ONLY-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
; POW2-ONLY-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
; POW2-ONLY-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
; POW2-ONLY-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
; POW2-ONLY-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%0 = extractelement <3 x float> zeroinitializer, i64 1
Expand Down Expand Up @@ -292,19 +325,30 @@ entry:
}

define void @reuse_shuffle_indidces_1(ptr %col, float %0, float %1) {
; CHECK-LABEL: define void @reuse_shuffle_indidces_1(
; CHECK-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
; CHECK-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
; CHECK-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
; CHECK-NEXT: ret void
; NON-POW2-LABEL: define void @reuse_shuffle_indidces_1(
; NON-POW2-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1
; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2
; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer
; NON-POW2-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer
; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: define void @reuse_shuffle_indidces_1(
; POW2-ONLY-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
; POW2-ONLY-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
; POW2-ONLY-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
; POW2-ONLY-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
; POW2-ONLY-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
; POW2-ONLY-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%mul24 = fmul float %1, 0.000000e+00
Expand Down Expand Up @@ -513,4 +557,61 @@ entry:
ret void
}

define void @can_reorder_vec3_op_with_padding(ptr %A, <3 x float> %in) {
; NON-POW2-LABEL: define void @can_reorder_vec3_op_with_padding(
; NON-POW2-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) {
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0
; NON-POW2-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[IN]], i64 1
; NON-POW2-NEXT: [[TMP2:%.*]] = extractelement <3 x float> [[IN]], i64 2
; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP2]], i32 1
; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 2
; NON-POW2-NEXT: [[TMP6:%.*]] = fsub <3 x float> [[TMP5]], [[TMP5]]
; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <3 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>)
; NON-POW2-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
; NON-POW2-NEXT: store <3 x float> [[TMP8]], ptr [[A]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding(
; POW2-ONLY-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) {
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr float, ptr [[A]], i64 2
; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0
; POW2-ONLY-NEXT: [[SUB_I362:%.*]] = fsub float [[TMP0]], [[TMP0]]
; POW2-ONLY-NEXT: [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[SUB_I362]], float 2.000000e+00, float 3.000000e+00)
; POW2-ONLY-NEXT: [[MUL6_I_I_I_I:%.*]] = fmul float [[TMP1]], 3.000000e+00
; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP2]], [[TMP2]]
; POW2-ONLY-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 3.000000e+00, float 3.000000e+00>)
; POW2-ONLY-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], <float 3.000000e+00, float 3.000000e+00>
; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[A]], align 4
; POW2-ONLY-NEXT: store float [[MUL6_I_I_I_I]], ptr [[ARRAYIDX42_I]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%arrayidx42.i = getelementptr float, ptr %A, i64 2
%arrayidx35.i = getelementptr float, ptr %A, i64 1
%0 = extractelement <3 x float> %in, i64 0
%1 = extractelement <3 x float> %in, i64 0
%sub.i362 = fsub float %0, %1
%2 = extractelement <3 x float> %in, i64 1
%3 = extractelement <3 x float> %in, i64 1
%sub5.i = fsub float %2, %3
%4 = extractelement <3 x float> %in, i64 2
%5 = extractelement <3 x float> %in, i64 2
%sub9.i = fsub float %4, %5
%6 = call float @llvm.fmuladd.f32(float %sub5.i, float 2.000000e+00, float 3.000000e+00)
%7 = call float @llvm.fmuladd.f32(float %sub9.i, float 2.000000e+00, float 3.000000e+00)
%8 = call float @llvm.fmuladd.f32(float %sub.i362, float 2.000000e+00, float 3.000000e+00)
%mul.i.i.i.i373 = fmul float %6, 3.000000e+00
%mul3.i.i.i.i = fmul float %7, 3.000000e+00
%mul6.i.i.i.i = fmul float %8, 3.000000e+00
store float %mul.i.i.i.i373, ptr %A, align 4
store float %mul3.i.i.i.i, ptr %arrayidx35.i, align 4
store float %mul6.i.i.i.i, ptr %arrayidx42.i, align 4
ret void
}

declare float @llvm.fmuladd.f32(float, float, float)
declare double @llvm.fmuladd.f64(double, double, double)
66 changes: 40 additions & 26 deletions llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer,dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
; RUN: opt < %s -passes=slp-vectorizer,dce -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck --check-prefixes=CHECK,NON-POW2 %s
; RUN: opt < %s -passes=slp-vectorizer,dce -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck --check-prefixes=CHECK,POW2-ONLY %s

;int foo(char * restrict A, ptr restrict B, float T) {
; A[0] = (T * B[10] + 4.0);
Expand All @@ -8,31 +9,44 @@
;}

define i32 @foo(ptr noalias nocapture %A, ptr noalias nocapture %B, float %T) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double
; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00
; CHECK-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8
; CHECK-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11
; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]]
; CHECK-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double
; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00
; CHECK-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12
; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]]
; CHECK-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double
; CHECK-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00
; CHECK-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
; CHECK-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1
; CHECK-NEXT: ret i32 undef
; NON-POW2-LABEL: @foo(
; NON-POW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP1]], align 4
; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[T:%.*]], i32 0
; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[T]], i32 1
; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[T]], i32 2
; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP2]], [[TMP5]]
; NON-POW2-NEXT: [[TMP7:%.*]] = fpext <3 x float> [[TMP6]] to <3 x double>
; NON-POW2-NEXT: [[TMP8:%.*]] = fadd <3 x double> [[TMP7]], <double 4.000000e+00, double 5.000000e+00, double 6.000000e+00>
; NON-POW2-NEXT: [[TMP9:%.*]] = fptosi <3 x double> [[TMP8]] to <3 x i8>
; NON-POW2-NEXT: store <3 x i8> [[TMP9]], ptr [[A:%.*]], align 1
; NON-POW2-NEXT: ret i32 undef
;
; POW2-ONLY-LABEL: @foo(
; POW2-ONLY-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]]
; POW2-ONLY-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double
; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00
; POW2-ONLY-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8
; POW2-ONLY-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1
; POW2-ONLY-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11
; POW2-ONLY-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
; POW2-ONLY-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]]
; POW2-ONLY-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double
; POW2-ONLY-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00
; POW2-ONLY-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8
; POW2-ONLY-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
; POW2-ONLY-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1
; POW2-ONLY-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12
; POW2-ONLY-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4
; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]]
; POW2-ONLY-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double
; POW2-ONLY-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00
; POW2-ONLY-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8
; POW2-ONLY-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
; POW2-ONLY-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1
; POW2-ONLY-NEXT: ret i32 undef
;
%1 = getelementptr inbounds float, ptr %B, i64 10
%2 = load float, ptr %1, align 4
Expand Down
343 changes: 194 additions & 149 deletions llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll

Large diffs are not rendered by default.