160 changes: 32 additions & 128 deletions llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll

Large diffs are not rendered by default.

37 changes: 29 additions & 8 deletions llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -598,14 +598,35 @@ define void @sitofp_8i8_8f64() #0 {
;

define void @sitofp_2i64_2f32() #0 {
; CHECK-LABEL: @sitofp_2i64_2f32(
; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
; CHECK-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
; CHECK-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; CHECK-NEXT: ret void
; SSE-LABEL: @sitofp_2i64_2f32(
; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE-NEXT: ret void
;
; AVX256NODQ-LABEL: @sitofp_2i64_2f32(
; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; AVX256NODQ-NEXT: ret void
;
; AVX512-LABEL: @sitofp_2i64_2f32(
; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float>
; AVX512-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
; AVX512-NEXT: ret void
;
; AVX256DQ-LABEL: @sitofp_2i64_2f32(
; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float>
; AVX256DQ-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
; AVX256DQ-NEXT: ret void
;
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
8 changes: 4 additions & 4 deletions llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -172,13 +172,13 @@ define void @tiny_tree_not_fully_vectorizable2(float* noalias nocapture %dst, fl
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX5]] to <2 x float>*
; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]]
; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1
37 changes: 29 additions & 8 deletions llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -550,14 +550,35 @@ define void @uitofp_8i8_8f64() #0 {
;

define void @uitofp_2i64_2f32() #0 {
; CHECK-LABEL: @uitofp_2i64_2f32(
; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
; CHECK-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
; CHECK-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; CHECK-NEXT: ret void
; SSE-LABEL: @uitofp_2i64_2f32(
; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
; SSE-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
; SSE-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE-NEXT: ret void
;
; AVX256NODQ-LABEL: @uitofp_2i64_2f32(
; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; AVX256NODQ-NEXT: ret void
;
; AVX512-LABEL: @uitofp_2i64_2f32(
; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
; AVX512-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
; AVX512-NEXT: ret void
;
; AVX256DQ-LABEL: @uitofp_2i64_2f32(
; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
; AVX256DQ-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
; AVX256DQ-NEXT: ret void
;
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
51 changes: 51 additions & 0 deletions llvm/test/Transforms/SLPVectorizer/X86/vec-reg-64bit.ll
@@ -0,0 +1,51 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basicaa -slp-vectorizer -mcpu=btver2 -S | FileCheck %s --check-prefix=VECT
; RUN: opt < %s -basicaa -slp-vectorizer -mcpu=btver2 -slp-min-reg-size=128 -S | FileCheck %s --check-prefix=NOVECT

; Check that the SLPVectorizer works for packed horizontal 128-bit instructions.
; See llvm.org/PR32433
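;
; A rough C equivalent of @add_pairs_128 below (an assumption for illustration;
; the original source is not part of this patch):
;
;   typedef float v4f __attribute__((vector_size(16)));
;   void add_pairs_128(v4f x, float *out) {
;     out[0] = x[0] + x[1]; // lanes 0 and 1
;     out[1] = x[2] + x[3]; // lanes 2 and 3
;   }
;
; With the default minimum SLP register width, the two scalar fadds can be
; packed into a single <2 x float> fadd (VECT), which the backend can typically
; lower to a packed horizontal add. Passing -slp-min-reg-size=128 disallows
; 64-bit vectors, so the scalar code is kept (NOVECT).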

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @add_pairs_128(<4 x float>, float* nocapture) #0 {
; VECT-LABEL: @add_pairs_128(
; VECT-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0:%.*]], i32 0
; VECT-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; VECT-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; VECT-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; VECT-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0
; VECT-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP5]], i32 1
; VECT-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i32 0
; VECT-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP6]], i32 1
; VECT-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP8]], [[TMP10]]
; VECT-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 1
; VECT-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP1]] to <2 x float>*
; VECT-NEXT: store <2 x float> [[TMP11]], <2 x float>* [[TMP13]], align 4
; VECT-NEXT: ret void
;
; NOVECT-LABEL: @add_pairs_128(
; NOVECT-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0:%.*]], i32 0
; NOVECT-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOVECT-NEXT: [[TMP5:%.*]] = fadd float [[TMP3]], [[TMP4]]
; NOVECT-NEXT: store float [[TMP5]], float* [[TMP1:%.*]], align 4
; NOVECT-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOVECT-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOVECT-NEXT: [[TMP8:%.*]] = fadd float [[TMP6]], [[TMP7]]
; NOVECT-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 1
; NOVECT-NEXT: store float [[TMP8]], float* [[TMP9]], align 4
; NOVECT-NEXT: ret void
;
%3 = extractelement <4 x float> %0, i32 0
%4 = extractelement <4 x float> %0, i32 1
%5 = fadd float %3, %4
store float %5, float* %1, align 4
%6 = extractelement <4 x float> %0, i32 2
%7 = extractelement <4 x float> %0, i32 3
%8 = fadd float %6, %7
%9 = getelementptr inbounds float, float* %1, i64 1
store float %8, float* %9, align 4
ret void
}

attributes #0 = { nounwind }
199 changes: 97 additions & 102 deletions llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll

Large diffs are not rendered by default.

49 changes: 26 additions & 23 deletions llvm/test/Transforms/SLPVectorizer/X86/zext.ll
@@ -682,18 +682,20 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
; SSE2-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
; SSE2-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
; SSE2-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
; SSE2-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
; SSE2-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>*
; SSE2-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1
; SSE2-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
; SSE2-NEXT: [[TMP6:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0
; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1
; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2
; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3
; SSE2-NEXT: ret <4 x i64> [[V3]]
;
; SLM-LABEL: @loadext_4i32_to_4i64(
@@ -719,17 +721,18 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
; AVX1-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
; AVX1-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>*
; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1
; AVX1-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
; AVX1-NEXT: [[TMP6:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0
; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1
; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2
; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3
; AVX1-NEXT: ret <4 x i64> [[V3]]
;
; AVX2-LABEL: @loadext_4i32_to_4i64(