From 612a4a3365d2f6ea17bcbef8e7481a6a1823ca87 Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Wed, 8 Oct 2025 12:48:05 -0700 Subject: [PATCH 1/9] [RISCV] Add a test for satd-8x4 from x264 benchmark. --- .../SLPVectorizer/RISCV/x264-satd-8x4.ll | 526 ++++++++++++++++++ 1 file changed, 526 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll new file mode 100644 index 0000000000000..c1042f1842107 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll @@ -0,0 +1,526 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem \ +; RUN: -passes=slp-vectorizer -S < %s | FileCheck %s +; Function Attrs: nounwind uwtable vscale_range(8,1024) +define i32 @x264_pixel_satd_8x4(ptr %pix1, i32 %i_pix1, ptr %pix2, i32 %i_pix2) { +; CHECK-LABEL: define i32 @x264_pixel_satd_8x4( +; CHECK-SAME: ptr [[PIX1:%.*]], i32 [[I_PIX1:%.*]], ptr [[PIX2:%.*]], i32 [[I_PIX2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[I_PIX1]] to i64 +; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[I_PIX2]] to i64 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 4 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PIX1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[PIX2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_1]], i64 4 +; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x 
i8>, ptr [[ADD_PTR_2]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP18]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], splat (i32 16) +; CHECK-NEXT: [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]] +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 
x i32> +; CHECK-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[TMP52]], [[TMP51]] +; CHECK-NEXT: [[TMP54:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP51]] +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = lshr <16 x i32> [[TMP67]], splat (i32 15) +; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i32> [[TMP68]], splat (i32 65537) +; CHECK-NEXT: [[TMP70:%.*]] = mul nuw <16 x i32> [[TMP69]], splat (i32 65535) +; CHECK-NEXT: [[TMP71:%.*]] = add <16 x i32> [[TMP70]], [[TMP67]] +; CHECK-NEXT: [[TMP72:%.*]] = xor <16 x i32> [[TMP71]], [[TMP70]] +; CHECK-NEXT: [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP72]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP73]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP73]], 16 +; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] +; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 +; CHECK-NEXT: ret i32 [[SHR120]] +; +entry: + %idx.ext = sext i32 %i_pix1 to i64 + %idx.ext63 = sext i32 %i_pix2 to i64 + %0 = load i8, ptr %pix1, align 1 + %conv = zext i8 %0 to i32 + %1 = load i8, ptr %pix2, align 1 + %conv2 = zext i8 %1 to i32 + %sub = sub nsw i32 %conv, %conv2 + %arrayidx3 = getelementptr inbounds nuw i8, ptr %pix1, i64 4 + %2 = load i8, ptr %arrayidx3, align 1 + %conv4 = zext i8 %2 to i32 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %pix2, i64 4 + %3 = load i8, ptr %arrayidx5, align 1 + %conv6 = zext i8 %3 to i32 + %sub7 = sub nsw i32 %conv4, %conv6 + %shl = shl nsw i32 %sub7, 16 + %add = add nsw i32 %shl, %sub + %arrayidx8 = getelementptr inbounds nuw i8, ptr %pix1, i64 1 + %4 = load i8, ptr %arrayidx8, align 1 + %conv9 = zext i8 %4 to i32 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %pix2, i64 1 + %5 = load i8, ptr %arrayidx10, align 1 + %conv11 = zext i8 %5 to i32 + %sub12 = sub nsw i32 %conv9, %conv11 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %pix1, i64 5 + %6 = load i8, ptr %arrayidx13, align 1 + %conv14 = zext i8 %6 to i32 + %arrayidx15 = getelementptr inbounds nuw i8, ptr %pix2, i64 5 + %7 = load i8, ptr %arrayidx15, align 1 + %conv16 = zext i8 %7 to i32 + %sub17 = sub nsw i32 %conv14, %conv16 + %shl18 = shl nsw i32 %sub17, 16 + %add19 = add nsw i32 %shl18, %sub12 + %arrayidx20 = getelementptr inbounds nuw i8, ptr %pix1, i64 2 + %8 = load i8, ptr %arrayidx20, align 1 + %conv21 = zext i8 %8 to i32 + %arrayidx22 = getelementptr inbounds nuw i8, ptr %pix2, i64 2 + %9 = load i8, ptr %arrayidx22, align 1 + %conv23 = zext i8 %9 to 
i32 + %sub24 = sub nsw i32 %conv21, %conv23 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %pix1, i64 6 + %10 = load i8, ptr %arrayidx25, align 1 + %conv26 = zext i8 %10 to i32 + %arrayidx27 = getelementptr inbounds nuw i8, ptr %pix2, i64 6 + %11 = load i8, ptr %arrayidx27, align 1 + %conv28 = zext i8 %11 to i32 + %sub29 = sub nsw i32 %conv26, %conv28 + %shl30 = shl nsw i32 %sub29, 16 + %add31 = add nsw i32 %shl30, %sub24 + %arrayidx32 = getelementptr inbounds nuw i8, ptr %pix1, i64 3 + %12 = load i8, ptr %arrayidx32, align 1 + %conv33 = zext i8 %12 to i32 + %arrayidx34 = getelementptr inbounds nuw i8, ptr %pix2, i64 3 + %13 = load i8, ptr %arrayidx34, align 1 + %conv35 = zext i8 %13 to i32 + %sub36 = sub nsw i32 %conv33, %conv35 + %arrayidx37 = getelementptr inbounds nuw i8, ptr %pix1, i64 7 + %14 = load i8, ptr %arrayidx37, align 1 + %conv38 = zext i8 %14 to i32 + %arrayidx39 = getelementptr inbounds nuw i8, ptr %pix2, i64 7 + %15 = load i8, ptr %arrayidx39, align 1 + %conv40 = zext i8 %15 to i32 + %sub41 = sub nsw i32 %conv38, %conv40 + %shl42 = shl nsw i32 %sub41, 16 + %add43 = add nsw i32 %shl42, %sub36 + %add44 = add nsw i32 %add19, %add + %sub45 = sub nsw i32 %add, %add19 + %add46 = add nsw i32 %add43, %add31 + %sub47 = sub nsw i32 %add31, %add43 + %add48 = add nsw i32 %add46, %add44 + %sub51 = sub nsw i32 %add44, %add46 + %add55 = add nsw i32 %sub47, %sub45 + %sub59 = sub nsw i32 %sub45, %sub47 + %add.ptr = getelementptr inbounds i8, ptr %pix1, i64 %idx.ext + %add.ptr64 = getelementptr inbounds i8, ptr %pix2, i64 %idx.ext63 + %16 = load i8, ptr %add.ptr, align 1 + %conv.1 = zext i8 %16 to i32 + %17 = load i8, ptr %add.ptr64, align 1 + %conv2.1 = zext i8 %17 to i32 + %sub.1 = sub nsw i32 %conv.1, %conv2.1 + %arrayidx3.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 4 + %18 = load i8, ptr %arrayidx3.1, align 1 + %conv4.1 = zext i8 %18 to i32 + %arrayidx5.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 4 + %19 = load i8, ptr %arrayidx5.1, align 1 + %conv6.1 = zext i8 %19 to i32 + %sub7.1 = sub nsw i32 %conv4.1, %conv6.1 + %shl.1 = shl nsw i32 %sub7.1, 16 + %add.1 = add nsw i32 %shl.1, %sub.1 + %arrayidx8.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 1 + %20 = load i8, ptr %arrayidx8.1, align 1 + %conv9.1 = zext i8 %20 to i32 + %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 1 + %21 = load i8, ptr %arrayidx10.1, align 1 + %conv11.1 = zext i8 %21 to i32 + %sub12.1 = sub nsw i32 %conv9.1, %conv11.1 + %arrayidx13.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 5 + %22 = load i8, ptr %arrayidx13.1, align 1 + %conv14.1 = zext i8 %22 to i32 + %arrayidx15.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 5 + %23 = load i8, ptr %arrayidx15.1, align 1 + %conv16.1 = zext i8 %23 to i32 + %sub17.1 = sub nsw i32 %conv14.1, %conv16.1 + %shl18.1 = shl nsw i32 %sub17.1, 16 + %add19.1 = add nsw i32 %shl18.1, %sub12.1 + %arrayidx20.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 2 + %24 = load i8, ptr %arrayidx20.1, align 1 + %conv21.1 = zext i8 %24 to i32 + %arrayidx22.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 2 + %25 = load i8, ptr %arrayidx22.1, align 1 + %conv23.1 = zext i8 %25 to i32 + %sub24.1 = sub nsw i32 %conv21.1, %conv23.1 + %arrayidx25.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 6 + %26 = load i8, ptr %arrayidx25.1, align 1 + %conv26.1 = zext i8 %26 to i32 + %arrayidx27.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 6 + %27 = load i8, ptr %arrayidx27.1, align 1 + %conv28.1 = zext i8 %27 to i32 + %sub29.1 = 
sub nsw i32 %conv26.1, %conv28.1 + %shl30.1 = shl nsw i32 %sub29.1, 16 + %add31.1 = add nsw i32 %shl30.1, %sub24.1 + %arrayidx32.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 3 + %28 = load i8, ptr %arrayidx32.1, align 1 + %conv33.1 = zext i8 %28 to i32 + %arrayidx34.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 3 + %29 = load i8, ptr %arrayidx34.1, align 1 + %conv35.1 = zext i8 %29 to i32 + %sub36.1 = sub nsw i32 %conv33.1, %conv35.1 + %arrayidx37.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 7 + %30 = load i8, ptr %arrayidx37.1, align 1 + %conv38.1 = zext i8 %30 to i32 + %arrayidx39.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 7 + %31 = load i8, ptr %arrayidx39.1, align 1 + %conv40.1 = zext i8 %31 to i32 + %sub41.1 = sub nsw i32 %conv38.1, %conv40.1 + %shl42.1 = shl nsw i32 %sub41.1, 16 + %add43.1 = add nsw i32 %shl42.1, %sub36.1 + %add44.1 = add nsw i32 %add19.1, %add.1 + %sub45.1 = sub nsw i32 %add.1, %add19.1 + %add46.1 = add nsw i32 %add43.1, %add31.1 + %sub47.1 = sub nsw i32 %add31.1, %add43.1 + %add48.1 = add nsw i32 %add46.1, %add44.1 + %sub51.1 = sub nsw i32 %add44.1, %add46.1 + %add55.1 = add nsw i32 %sub47.1, %sub45.1 + %sub59.1 = sub nsw i32 %sub45.1, %sub47.1 + %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext + %add.ptr64.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 %idx.ext63 + %32 = load i8, ptr %add.ptr.1, align 1 + %conv.2 = zext i8 %32 to i32 + %33 = load i8, ptr %add.ptr64.1, align 1 + %conv2.2 = zext i8 %33 to i32 + %sub.2 = sub nsw i32 %conv.2, %conv2.2 + %arrayidx3.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 4 + %34 = load i8, ptr %arrayidx3.2, align 1 + %conv4.2 = zext i8 %34 to i32 + %arrayidx5.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 4 + %35 = load i8, ptr %arrayidx5.2, align 1 + %conv6.2 = zext i8 %35 to i32 + %sub7.2 = sub nsw i32 %conv4.2, %conv6.2 + %shl.2 = shl nsw i32 %sub7.2, 16 + %add.2 = add nsw i32 %shl.2, %sub.2 + %arrayidx8.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 1 + %36 = load i8, ptr %arrayidx8.2, align 1 + %conv9.2 = zext i8 %36 to i32 + %arrayidx10.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 1 + %37 = load i8, ptr %arrayidx10.2, align 1 + %conv11.2 = zext i8 %37 to i32 + %sub12.2 = sub nsw i32 %conv9.2, %conv11.2 + %arrayidx13.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 5 + %38 = load i8, ptr %arrayidx13.2, align 1 + %conv14.2 = zext i8 %38 to i32 + %arrayidx15.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 5 + %39 = load i8, ptr %arrayidx15.2, align 1 + %conv16.2 = zext i8 %39 to i32 + %sub17.2 = sub nsw i32 %conv14.2, %conv16.2 + %shl18.2 = shl nsw i32 %sub17.2, 16 + %add19.2 = add nsw i32 %shl18.2, %sub12.2 + %arrayidx20.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 2 + %40 = load i8, ptr %arrayidx20.2, align 1 + %conv21.2 = zext i8 %40 to i32 + %arrayidx22.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 2 + %41 = load i8, ptr %arrayidx22.2, align 1 + %conv23.2 = zext i8 %41 to i32 + %sub24.2 = sub nsw i32 %conv21.2, %conv23.2 + %arrayidx25.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 6 + %42 = load i8, ptr %arrayidx25.2, align 1 + %conv26.2 = zext i8 %42 to i32 + %arrayidx27.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 6 + %43 = load i8, ptr %arrayidx27.2, align 1 + %conv28.2 = zext i8 %43 to i32 + %sub29.2 = sub nsw i32 %conv26.2, %conv28.2 + %shl30.2 = shl nsw i32 %sub29.2, 16 + %add31.2 = add nsw i32 %shl30.2, %sub24.2 + %arrayidx32.2 = getelementptr inbounds nuw i8, ptr 
%add.ptr.1, i64 3 + %44 = load i8, ptr %arrayidx32.2, align 1 + %conv33.2 = zext i8 %44 to i32 + %arrayidx34.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 3 + %45 = load i8, ptr %arrayidx34.2, align 1 + %conv35.2 = zext i8 %45 to i32 + %sub36.2 = sub nsw i32 %conv33.2, %conv35.2 + %arrayidx37.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 7 + %46 = load i8, ptr %arrayidx37.2, align 1 + %conv38.2 = zext i8 %46 to i32 + %arrayidx39.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 7 + %47 = load i8, ptr %arrayidx39.2, align 1 + %conv40.2 = zext i8 %47 to i32 + %sub41.2 = sub nsw i32 %conv38.2, %conv40.2 + %shl42.2 = shl nsw i32 %sub41.2, 16 + %add43.2 = add nsw i32 %shl42.2, %sub36.2 + %add44.2 = add nsw i32 %add19.2, %add.2 + %sub45.2 = sub nsw i32 %add.2, %add19.2 + %add46.2 = add nsw i32 %add43.2, %add31.2 + %sub47.2 = sub nsw i32 %add31.2, %add43.2 + %add48.2 = add nsw i32 %add46.2, %add44.2 + %sub51.2 = sub nsw i32 %add44.2, %add46.2 + %add55.2 = add nsw i32 %sub47.2, %sub45.2 + %sub59.2 = sub nsw i32 %sub45.2, %sub47.2 + %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext + %add.ptr64.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 %idx.ext63 + %48 = load i8, ptr %add.ptr.2, align 1 + %conv.3 = zext i8 %48 to i32 + %49 = load i8, ptr %add.ptr64.2, align 1 + %conv2.3 = zext i8 %49 to i32 + %sub.3 = sub nsw i32 %conv.3, %conv2.3 + %arrayidx3.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 4 + %50 = load i8, ptr %arrayidx3.3, align 1 + %conv4.3 = zext i8 %50 to i32 + %arrayidx5.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 4 + %51 = load i8, ptr %arrayidx5.3, align 1 + %conv6.3 = zext i8 %51 to i32 + %sub7.3 = sub nsw i32 %conv4.3, %conv6.3 + %shl.3 = shl nsw i32 %sub7.3, 16 + %add.3 = add nsw i32 %shl.3, %sub.3 + %arrayidx8.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 1 + %52 = load i8, ptr %arrayidx8.3, align 1 + %conv9.3 = zext i8 %52 to i32 + %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 1 + %53 = load i8, ptr %arrayidx10.3, align 1 + %conv11.3 = zext i8 %53 to i32 + %sub12.3 = sub nsw i32 %conv9.3, %conv11.3 + %arrayidx13.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 5 + %54 = load i8, ptr %arrayidx13.3, align 1 + %conv14.3 = zext i8 %54 to i32 + %arrayidx15.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 5 + %55 = load i8, ptr %arrayidx15.3, align 1 + %conv16.3 = zext i8 %55 to i32 + %sub17.3 = sub nsw i32 %conv14.3, %conv16.3 + %shl18.3 = shl nsw i32 %sub17.3, 16 + %add19.3 = add nsw i32 %shl18.3, %sub12.3 + %arrayidx20.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 2 + %56 = load i8, ptr %arrayidx20.3, align 1 + %conv21.3 = zext i8 %56 to i32 + %arrayidx22.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 2 + %57 = load i8, ptr %arrayidx22.3, align 1 + %conv23.3 = zext i8 %57 to i32 + %sub24.3 = sub nsw i32 %conv21.3, %conv23.3 + %arrayidx25.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 6 + %58 = load i8, ptr %arrayidx25.3, align 1 + %conv26.3 = zext i8 %58 to i32 + %arrayidx27.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 6 + %59 = load i8, ptr %arrayidx27.3, align 1 + %conv28.3 = zext i8 %59 to i32 + %sub29.3 = sub nsw i32 %conv26.3, %conv28.3 + %shl30.3 = shl nsw i32 %sub29.3, 16 + %add31.3 = add nsw i32 %shl30.3, %sub24.3 + %arrayidx32.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 3 + %60 = load i8, ptr %arrayidx32.3, align 1 + %conv33.3 = zext i8 %60 to i32 + %arrayidx34.3 = getelementptr inbounds nuw i8, ptr 
%add.ptr64.2, i64 3 + %61 = load i8, ptr %arrayidx34.3, align 1 + %conv35.3 = zext i8 %61 to i32 + %sub36.3 = sub nsw i32 %conv33.3, %conv35.3 + %arrayidx37.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 7 + %62 = load i8, ptr %arrayidx37.3, align 1 + %conv38.3 = zext i8 %62 to i32 + %arrayidx39.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 7 + %63 = load i8, ptr %arrayidx39.3, align 1 + %conv40.3 = zext i8 %63 to i32 + %sub41.3 = sub nsw i32 %conv38.3, %conv40.3 + %shl42.3 = shl nsw i32 %sub41.3, 16 + %add43.3 = add nsw i32 %shl42.3, %sub36.3 + %add44.3 = add nsw i32 %add19.3, %add.3 + %sub45.3 = sub nsw i32 %add.3, %add19.3 + %add46.3 = add nsw i32 %add43.3, %add31.3 + %sub47.3 = sub nsw i32 %add31.3, %add43.3 + %add48.3 = add nsw i32 %add46.3, %add44.3 + %sub51.3 = sub nsw i32 %add44.3, %add46.3 + %add55.3 = add nsw i32 %sub47.3, %sub45.3 + %sub59.3 = sub nsw i32 %sub45.3, %sub47.3 + %add78 = add nsw i32 %add48.1, %add48 + %sub86 = sub nsw i32 %add48, %add48.1 + %add94 = add nsw i32 %add48.3, %add48.2 + %sub102 = sub nsw i32 %add48.2, %add48.3 + %add103 = add nsw i32 %add94, %add78 + %sub104 = sub nsw i32 %add78, %add94 + %add105 = add nsw i32 %sub102, %sub86 + %sub106 = sub nsw i32 %sub86, %sub102 + %shr.i = lshr i32 %add103, 15 + %and.i = and i32 %shr.i, 65537 + %mul.i = mul nuw i32 %and.i, 65535 + %add.i = add i32 %mul.i, %add103 + %xor.i = xor i32 %add.i, %mul.i + %shr.i169 = lshr i32 %add105, 15 + %and.i170 = and i32 %shr.i169, 65537 + %mul.i171 = mul nuw i32 %and.i170, 65535 + %add.i172 = add i32 %mul.i171, %add105 + %xor.i173 = xor i32 %add.i172, %mul.i171 + %shr.i174 = lshr i32 %sub104, 15 + %and.i175 = and i32 %shr.i174, 65537 + %mul.i176 = mul nuw i32 %and.i175, 65535 + %add.i177 = add i32 %mul.i176, %sub104 + %xor.i178 = xor i32 %add.i177, %mul.i176 + %shr.i179 = lshr i32 %sub106, 15 + %and.i180 = and i32 %shr.i179, 65537 + %mul.i181 = mul nuw i32 %and.i180, 65535 + %add.i182 = add i32 %mul.i181, %sub106 + %xor.i183 = xor i32 %add.i182, %mul.i181 + %add110 = add i32 %xor.i173, %xor.i + %add112 = add i32 %add110, %xor.i178 + %add113 = add i32 %add112, %xor.i183 + %add78.1 = add nsw i32 %add55.1, %add55 + %sub86.1 = sub nsw i32 %add55, %add55.1 + %add94.1 = add nsw i32 %add55.3, %add55.2 + %sub102.1 = sub nsw i32 %add55.2, %add55.3 + %add103.1 = add nsw i32 %add94.1, %add78.1 + %sub104.1 = sub nsw i32 %add78.1, %add94.1 + %add105.1 = add nsw i32 %sub102.1, %sub86.1 + %sub106.1 = sub nsw i32 %sub86.1, %sub102.1 + %shr.i.1 = lshr i32 %add103.1, 15 + %and.i.1 = and i32 %shr.i.1, 65537 + %mul.i.1 = mul nuw i32 %and.i.1, 65535 + %add.i.1 = add i32 %mul.i.1, %add103.1 + %xor.i.1 = xor i32 %add.i.1, %mul.i.1 + %shr.i169.1 = lshr i32 %add105.1, 15 + %and.i170.1 = and i32 %shr.i169.1, 65537 + %mul.i171.1 = mul nuw i32 %and.i170.1, 65535 + %add.i172.1 = add i32 %mul.i171.1, %add105.1 + %xor.i173.1 = xor i32 %add.i172.1, %mul.i171.1 + %shr.i174.1 = lshr i32 %sub104.1, 15 + %and.i175.1 = and i32 %shr.i174.1, 65537 + %mul.i176.1 = mul nuw i32 %and.i175.1, 65535 + %add.i177.1 = add i32 %mul.i176.1, %sub104.1 + %xor.i178.1 = xor i32 %add.i177.1, %mul.i176.1 + %shr.i179.1 = lshr i32 %sub106.1, 15 + %and.i180.1 = and i32 %shr.i179.1, 65537 + %mul.i181.1 = mul nuw i32 %and.i180.1, 65535 + %add.i182.1 = add i32 %mul.i181.1, %sub106.1 + %xor.i183.1 = xor i32 %add.i182.1, %mul.i181.1 + %add108.1 = add i32 %xor.i173.1, %add113 + %add110.1 = add i32 %add108.1, %xor.i.1 + %add112.1 = add i32 %add110.1, %xor.i178.1 + %add113.1 = add i32 %add112.1, %xor.i183.1 + %add78.2 = add nsw i32 
%sub51.1, %sub51 + %sub86.2 = sub nsw i32 %sub51, %sub51.1 + %add94.2 = add nsw i32 %sub51.3, %sub51.2 + %sub102.2 = sub nsw i32 %sub51.2, %sub51.3 + %add103.2 = add nsw i32 %add94.2, %add78.2 + %sub104.2 = sub nsw i32 %add78.2, %add94.2 + %add105.2 = add nsw i32 %sub102.2, %sub86.2 + %sub106.2 = sub nsw i32 %sub86.2, %sub102.2 + %shr.i.2 = lshr i32 %add103.2, 15 + %and.i.2 = and i32 %shr.i.2, 65537 + %mul.i.2 = mul nuw i32 %and.i.2, 65535 + %add.i.2 = add i32 %mul.i.2, %add103.2 + %xor.i.2 = xor i32 %add.i.2, %mul.i.2 + %shr.i169.2 = lshr i32 %add105.2, 15 + %and.i170.2 = and i32 %shr.i169.2, 65537 + %mul.i171.2 = mul nuw i32 %and.i170.2, 65535 + %add.i172.2 = add i32 %mul.i171.2, %add105.2 + %xor.i173.2 = xor i32 %add.i172.2, %mul.i171.2 + %shr.i174.2 = lshr i32 %sub104.2, 15 + %and.i175.2 = and i32 %shr.i174.2, 65537 + %mul.i176.2 = mul nuw i32 %and.i175.2, 65535 + %add.i177.2 = add i32 %mul.i176.2, %sub104.2 + %xor.i178.2 = xor i32 %add.i177.2, %mul.i176.2 + %shr.i179.2 = lshr i32 %sub106.2, 15 + %and.i180.2 = and i32 %shr.i179.2, 65537 + %mul.i181.2 = mul nuw i32 %and.i180.2, 65535 + %add.i182.2 = add i32 %mul.i181.2, %sub106.2 + %xor.i183.2 = xor i32 %add.i182.2, %mul.i181.2 + %add108.2 = add i32 %xor.i173.2, %add113.1 + %add110.2 = add i32 %add108.2, %xor.i.2 + %add112.2 = add i32 %add110.2, %xor.i178.2 + %add113.2 = add i32 %add112.2, %xor.i183.2 + %add78.3 = add nsw i32 %sub59.1, %sub59 + %sub86.3 = sub nsw i32 %sub59, %sub59.1 + %add94.3 = add nsw i32 %sub59.3, %sub59.2 + %sub102.3 = sub nsw i32 %sub59.2, %sub59.3 + %add103.3 = add nsw i32 %add94.3, %add78.3 + %sub104.3 = sub nsw i32 %add78.3, %add94.3 + %add105.3 = add nsw i32 %sub102.3, %sub86.3 + %sub106.3 = sub nsw i32 %sub86.3, %sub102.3 + %shr.i.3 = lshr i32 %add103.3, 15 + %and.i.3 = and i32 %shr.i.3, 65537 + %mul.i.3 = mul nuw i32 %and.i.3, 65535 + %add.i.3 = add i32 %mul.i.3, %add103.3 + %xor.i.3 = xor i32 %add.i.3, %mul.i.3 + %shr.i169.3 = lshr i32 %add105.3, 15 + %and.i170.3 = and i32 %shr.i169.3, 65537 + %mul.i171.3 = mul nuw i32 %and.i170.3, 65535 + %add.i172.3 = add i32 %mul.i171.3, %add105.3 + %xor.i173.3 = xor i32 %add.i172.3, %mul.i171.3 + %shr.i174.3 = lshr i32 %sub104.3, 15 + %and.i175.3 = and i32 %shr.i174.3, 65537 + %mul.i176.3 = mul nuw i32 %and.i175.3, 65535 + %add.i177.3 = add i32 %mul.i176.3, %sub104.3 + %xor.i178.3 = xor i32 %add.i177.3, %mul.i176.3 + %shr.i179.3 = lshr i32 %sub106.3, 15 + %and.i180.3 = and i32 %shr.i179.3, 65537 + %mul.i181.3 = mul nuw i32 %and.i180.3, 65535 + %add.i182.3 = add i32 %mul.i181.3, %sub106.3 + %xor.i183.3 = xor i32 %add.i182.3, %mul.i181.3 + %add108.3 = add i32 %xor.i173.3, %add113.2 + %add110.3 = add i32 %add108.3, %xor.i.3 + %add112.3 = add i32 %add110.3, %xor.i178.3 + %add113.3 = add i32 %add112.3, %xor.i183.3 + %conv118 = and i32 %add113.3, 65535 + %shr = lshr i32 %add113.3, 16 + %add119 = add nuw nsw i32 %conv118, %shr + %shr120 = lshr i32 %add119, 1 + ret i32 %shr120 +} From 312f633a1616d35de86cf1e32afeefecc768d383 Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Tue, 23 Sep 2025 15:41:57 -0700 Subject: [PATCH 2/9] [SLPVectorizer] Widen constant strided loads. Given a set of pointers, check if they can be rearranged as follows (%s is a constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2 ... %b + 0 * %s + w - 1 %b + 1 * %s + 0 %b + 1 * %s + 1 %b + 1 * %s + 2 ... %b + 1 * %s + w - 1 ... If the pointers can be rearranged in the above pattern, it means that the memory can be accessed with strided loads of width `w` and stride `%s`.
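For example (an illustrative sketch, not part of the patch itself): sixteen i8 loads from a base pointer %b at offsets 0..3, 100..103, 200..203 and 300..303 fit this pattern with w = 4 and %s = 100. Each group of four i8 elements can then be loaded as one i32 element, so the sixteen scalar loads become a single strided load plus a bitcast:

  %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
           ptr align 1 %b, i64 100, <4 x i1> splat (i1 true), i32 4)
  %bytes = bitcast <4 x i32> %v to <16 x i8>

This is the shape exercised by the constant_stride_widen_no_reordering test in basic-strided-loads.ll updated below.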
--- .../Transforms/Vectorize/SLPVectorizer.cpp | 168 ++++++++++++++---- .../RISCV/basic-strided-loads.ll | 18 +- 2 files changed, 134 insertions(+), 52 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 91c3d4270b241..19f4f06e413ec 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2242,8 +2242,29 @@ class BoUpSLP { /// may not be necessary. bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const; bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, - Align Alignment, const int64_t Diff, Value *Ptr0, - Value *PtrN, StridedPtrInfo &SPtrInfo) const; + Align Alignment, int64_t Diff, size_t VecSz) const; + /// Given a set of pointers, check if they can be rearranged as follows (%s is + /// a constant): + /// %b + 0 * %s + 0 + /// %b + 0 * %s + 1 + /// %b + 0 * %s + 2 + /// ... + /// %b + 0 * %s + w - 1 + /// + /// %b + 1 * %s + 0 + /// %b + 1 * %s + 1 + /// %b + 1 * %s + 2 + /// ... + /// %b + 1 * %s + w - 1 + /// ... + /// + /// If the pointers can be rearranged in the above pattern, it means that the + /// memory can be accessed with strided loads of width `w` and stride `%s`. + bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps, + Type *ElemTy, Align CommonAlignment, + SmallVectorImpl<unsigned> &SortedIndices, + int64_t Diff, Value *Ptr0, Value *PtrN, + StridedPtrInfo &SPtrInfo) const; /// Checks if the given array of loads can be represented as a vectorized, /// scatter or just simple gather. @@ -6824,12 +6845,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, /// current graph (for masked gathers extra extractelement instructions /// might be required). bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, - Align Alignment, const int64_t Diff, Value *Ptr0, - Value *PtrN, StridedPtrInfo &SPtrInfo) const { - const size_t Sz = PointerOps.size(); - if (Diff % (Sz - 1) != 0) - return false; - + Align Alignment, int64_t Diff, size_t VecSz) const { // Try to generate strided load node.
auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) { return isa<Instruction>(V) && any_of(V->users(), [&](User *U) { }); @@ -6838,41 +6854,109 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, const uint64_t AbsoluteDiff = std::abs(Diff); - auto *VecTy = getWidenedType(ScalarTy, Sz); + auto *VecTy = getWidenedType(ScalarTy, VecSz); if (IsAnyPointerUsedOutGraph || - (AbsoluteDiff > Sz && - (Sz > MinProfitableStridedLoads || - (AbsoluteDiff <= MaxProfitableLoadStride * Sz && - AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) || - Diff == -(static_cast<int64_t>(Sz) - 1)) { - int64_t Stride = Diff / static_cast<int64_t>(Sz - 1); - if (Diff != Stride * static_cast<int64_t>(Sz - 1)) + (AbsoluteDiff > VecSz && + (VecSz > MinProfitableStridedLoads || + (AbsoluteDiff <= MaxProfitableLoadStride * VecSz && + AbsoluteDiff % VecSz == 0 && has_single_bit(AbsoluteDiff / VecSz)))) || + Diff == -(static_cast<int64_t>(VecSz) - 1)) { + int64_t Stride = Diff / static_cast<int64_t>(VecSz - 1); + if (Diff != Stride * static_cast<int64_t>(VecSz - 1)) return false; if (!TTI->isLegalStridedLoadStore(VecTy, Alignment)) return false; + } + return true; +} + +bool BoUpSLP::analyzeConstantStrideCandidate( + ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment, + SmallVectorImpl<unsigned> &SortedIndices, int64_t Diff, Value *Ptr0, + Value *PtrN, StridedPtrInfo &SPtrInfo) const { + const unsigned Sz = PointerOps.size(); + SmallVector<int64_t> SortedOffsetsFromBase; + SortedOffsetsFromBase.resize(Sz); + for (unsigned I : seq<unsigned>(Sz)) { + Value *Ptr = + SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]]; + SortedOffsetsFromBase[I] = + *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE); + } + assert(SortedOffsetsFromBase.size() > 1 && + "Trying to generate strided load for fewer than 2 loads"); + // + // Find where the first group ends. + int64_t StrideWithinGroup = + SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0]; + unsigned GroupSize = 1; + for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) { + if (SortedOffsetsFromBase[GroupSize] - + SortedOffsetsFromBase[GroupSize - 1] != + StrideWithinGroup) + break; + } + unsigned VecSz = Sz; + Type *ScalarTy = ElemTy; + int64_t StrideIntVal = StrideWithinGroup; + FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz); + + if (Sz != GroupSize) { + if (Sz % GroupSize != 0) + return false; + VecSz = Sz / GroupSize; + + if (StrideWithinGroup != 1) + return false; + // Each group of GroupSize consecutive elements is loaded as one wider element. + ScalarTy = Type::getIntNTy(SE->getContext(), + DL->getTypeSizeInBits(ElemTy).getFixedValue() * + GroupSize); + StridedLoadTy = getWidenedType(ScalarTy, VecSz); + if (!TTI->isTypeLegal(StridedLoadTy) || + !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment)) + return false; - // Iterate through all pointers and check if all distances are - // unique multiple of Dist. - SmallSet<int64_t, 4> Dists; - for (Value *Ptr : PointerOps) { - int64_t Dist = 0; - if (Ptr == PtrN) - Dist = Diff; - else if (Ptr != Ptr0) - Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); - // If the strides are not the same or repeated, we can't - // vectorize.
- if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) + unsigned PrevGroupStartIdx = 0; + unsigned CurrentGroupStartIdx = GroupSize; + int64_t StrideBetweenGroups = + SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0]; + StrideIntVal = StrideBetweenGroups; + while (CurrentGroupStartIdx != Sz) { + if (SortedOffsetsFromBase[CurrentGroupStartIdx] - + SortedOffsetsFromBase[PrevGroupStartIdx] != + StrideBetweenGroups) break; + PrevGroupStartIdx = CurrentGroupStartIdx; + CurrentGroupStartIdx += GroupSize; } - if (Dists.size() == Sz) { - Type *StrideTy = DL->getIndexType(Ptr0->getType()); - SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); - SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); - return true; + if (CurrentGroupStartIdx != Sz) + return false; + + auto CheckGroup = [&](unsigned StartIdx, unsigned GroupSize0, + int64_t StrideWithinGroup) -> bool { + unsigned GroupEndIdx = StartIdx + 1; + for (; GroupEndIdx != Sz; ++GroupEndIdx) { + if (SortedOffsetsFromBase[GroupEndIdx] - + SortedOffsetsFromBase[GroupEndIdx - 1] != + StrideWithinGroup) + break; + } + return GroupEndIdx - StartIdx == GroupSize0; + }; + for (unsigned I = 0; I < Sz; I += GroupSize) { + if (!CheckGroup(I, GroupSize, StrideWithinGroup)) + return false; } } - return false; + + if (!isStridedLoad(PointerOps, ScalarTy, CommonAlignment, Diff, VecSz)) + return false; + + Type *StrideTy = DL->getIndexType(Ptr0->getType()); + SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal); + SPtrInfo.Ty = StridedLoadTy; + return true; } BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( @@ -6958,8 +7042,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( Align Alignment = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]) ->getAlign(); - if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN, - SPtrInfo)) + if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order, + *Diff, Ptr0, PtrN, SPtrInfo)) return LoadsState::StridedVectorize; } if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || @@ -14867,11 +14951,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } break; case TreeEntry::StridedVectorize: { + const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E); + FixedVectorType *StridedLoadTy = SPtrInfo.Ty; + assert(StridedLoadTy && "Missing StridedPtrInfo for tree entry."); Align CommonAlignment = computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef()); VecLdCost = TTI->getStridedMemoryOpCost( - Instruction::Load, VecTy, LI0->getPointerOperand(), + Instruction::Load, StridedLoadTy, LI0->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind); + if (StridedLoadTy != VecTy) + VecLdCost += + TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy, + getCastContextHint(*E), CostKind); + break; } case TreeEntry::CompressVectorize: { @@ -19635,6 +19727,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ?
NewLI : ::propagateMetadata(NewLI, E->Scalars); + if (StridedLoadTy != VecTy) + V = Builder.CreateBitOrPointerCast(V, VecTy); V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll index 02e05b2e4138a..968e61fb5f9e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll @@ -621,22 +621,10 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) ; CHECK-LABEL: define void @constant_stride_widen_no_reordering( ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 -; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 100 -; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 200 -; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 300 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> -; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 [[GEP_L0]], i64 100, <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16 ; CHECK-NEXT: ret void ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 From a7fe9eb009827ca1fa8a87fe8b70c6cb005f9abe Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Wed, 24 Sep 2025 10:40:27 -0700 Subject: [PATCH 3/9] updated failing tests. 
--- .../RISCV/gather-insert-point-restore.ll | 15 +-- .../X86/entries-shuffled-diff-sizes.ll | 16 +-- .../X86/extractelements-vector-ops-shuffle.ll | 7 +- .../Transforms/SLPVectorizer/X86/pr47623.ll | 14 +-- .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 112 +++++++++--------- .../Transforms/SLPVectorizer/X86/pr47629.ll | 112 +++++++++--------- .../X86/redux-feed-buildvector.ll | 73 +++++++++--- .../X86/reorder-possible-strided-node.ll | 36 ++---- .../X86/split-node-num-operands.ll | 22 ++-- .../X86/split-vector-operand-with-reuses.ll | 28 +++-- 10 files changed, 226 insertions(+), 209 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll index 82c940353ba5a..60d3b291b5dd4 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll @@ -5,18 +5,19 @@ define i16 @test(ptr %i) { ; CHECK-LABEL: define i16 @test( ; CHECK-SAME: ptr [[I:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x ptr> [[TMP2]], <2 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[GEP_US154:%.*]] = getelementptr i8, ptr [[I]], i64 132860 ; CHECK-NEXT: [[GEP_US154_2:%.*]] = getelementptr i8, ptr [[I]], i64 142688 ; CHECK-NEXT: br label %[[FOR_COND5_US:.*]] ; CHECK: [[FOR_COND5_US]]: +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[GEP_US154]], i64 4914, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 [[GEP_US154_2]], i64 4914, <4 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> [[TMP3]], i32 2, <4 x i1> splat (i1 true), <4 x i16> poison) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[GEP_US154]], i64 4914, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP5]], <4 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[TMP7]]) ; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP8]], i16 0) ; CHECK-NEXT: ret i16 [[TMP9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll index b99a1c2d83394..dbcaafa9e5a8b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll @@ -15,13 +15,15 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]] ; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 -; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <88 x float> @llvm.masked.load.v88f32.p0(ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), i32 16, <88 x i1> , <88 x float> poison) +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <88 x float> [[TMP7]], <88 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 8 getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), i64 336, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <88 x float> [[TMP7]], <88 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]] ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP18]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: store <16 x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll index 7bbc694dc5181..12cb86287ce6f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll @@ -4,12 +4,11 @@ define double @test() { ; CHECK-LABEL: define double @test() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 6), align 16 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 5), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 9), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 8), align 16 -; CHECK-NEXT: 
[[TMP17:%.*]] = insertelement <4 x double> , double [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i64(ptr align 8 getelementptr inbounds ([13 x double], ptr null, i64 0, i64 6), i64 24, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP3]], <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call reassoc nsz double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = fmul double [[TMP6]], 0.000000e+00 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll index 1b11c3dcc081c..a577469a9aef0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -11,16 +11,10 @@ define void @foo() { ; SSE-LABEL: @foo( -; SSE-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 -; SSE-NEXT: store i32 [[TMP1]], ptr @a, align 16 -; SSE-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4 -; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4 -; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4 -; SSE-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8 -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; SSE-NEXT: store <8 x i32> [[TMP3]], ptr @a, align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @foo( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index fde76f8b0e8b9..ca129e7ab97f8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -153,36 +153,46 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; AVX-NEXT: store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, 
!tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX-NEXT: store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 -; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 -; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 +; AVX-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: define void @gather_load_2( ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; AVX2-NEXT: store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX2-NEXT: store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX2-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 -; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> 
[[TMP14]], -; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 +; AVX2-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: define void @gather_load_2( @@ -569,11 +579,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX-LABEL: define void @gather_load_div( ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 @@ -583,30 +591,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 -; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX-NEXT: [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa 
[[SHORT_TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> +; AVX-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0 ; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> ; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 ; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 -; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7 +; AVX-NEXT: [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> ; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4 ; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 ; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 @@ -618,11 +621,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-LABEL: define void @gather_load_div( ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 @@ -632,30 +633,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 
-; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 -; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX2-NEXT: [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0 ; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> ; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 ; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 -; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7 +; AVX2-NEXT: [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> ; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> 
[[TMP41]], float [[TMP12]], i32 4 ; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 ; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index cf380f04a6939..f651fa53c53c4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -153,36 +153,46 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; AVX-NEXT: store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX-NEXT: store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 -; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 -; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 +; AVX-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: define void @gather_load_2( ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +; AVX2-NEXT: store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: 
[[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; AVX2-NEXT: store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; AVX2-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3 -; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 +; AVX2-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: define void @gather_load_2( @@ -569,11 +579,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX-LABEL: define void @gather_load_div( ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 ; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 @@ -583,30 +591,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], 
align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 -; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX-NEXT: [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> +; AVX-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0 ; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> ; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 ; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 -; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7 +; AVX-NEXT: [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> ; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4 ; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 ; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 @@ -618,11 +621,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-LABEL: define void @gather_load_div( ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias 
readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 ; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 @@ -632,30 +633,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 ; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 ; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 ; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 -; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX2-NEXT: [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 
0 ; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> ; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4 ; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7 -; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7 +; AVX2-NEXT: [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> ; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4 ; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5 ; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll index f921278cdecf3..5bf3783034190 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -11,23 +11,62 @@ define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1 -; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8 -; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20 -; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> , <15 x double> poison) -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP4]]) -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, ptr [[GEP2_4]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <4 x double> [[TMP6]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[RDX_OP]]) -; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0 -; CHECK-NEXT: [[I143:%.*]] = insertelement <2 
x double> [[I142]], double [[TMP11]], i64 1 +; CHECK-NEXT: [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8 +; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3 +; CHECK-NEXT: [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8 +; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5 +; CHECK-NEXT: [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8 +; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7 +; CHECK-NEXT: [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8 +; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9 +; CHECK-NEXT: [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8 +; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11 +; CHECK-NEXT: [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8 +; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13 +; CHECK-NEXT: [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8 +; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 15 +; CHECK-NEXT: [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = call <24 x double> @llvm.masked.load.v24f64.p0(ptr [[ARG1:%.*]], i32 8, <24 x i1> , <24 x double> poison) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[LD1_0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[LD1_1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> poison, double [[LD1_2]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[LD1_3]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP17]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <2 x double> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = fadd fast <2 x double> [[TMP15]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> poison, double [[LD1_4]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x double> [[TMP22]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fmul fast <2 x double> [[TMP21]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = 
fadd fast <2 x double> [[TMP20]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> poison, double [[LD1_5]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = fmul fast <2 x double> [[TMP26]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = fadd fast <2 x double> [[TMP25]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> poison, double [[LD1_6]], i32 0 +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <2 x double> [[TMP32]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = fmul fast <2 x double> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = fadd fast <2 x double> [[TMP30]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <2 x double> poison, double [[LD1_7]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x double> [[TMP37]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <2 x double> [[TMP36]], [[TMP38]] +; CHECK-NEXT: [[I143:%.*]] = fadd fast <2 x double> [[TMP35]], [[TMP39]] ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true)) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index 19ce11c457f63..bd24093218874 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -5,18 +5,15 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP9]], i32 3 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP0]], <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: 
[[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> @@ -66,18 +63,15 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP9]], i32 3 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP0]], <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer @@ -129,18 +123,15 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP0]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> @@ -190,18 +181,15 @@ define void @test_rem() { ; CHECK-LABEL: 
define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2) ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP0]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll index 5aa4dba2b8a1b..4deddc138727a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll @@ -13,31 +13,31 @@ define i64 @Foo(ptr align 8 dereferenceable(344) %0, i64 %1) { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP7]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> , i64 [[TMP1]], i32 1 ; CHECK-NEXT: br label %[[BB16:.*]] ; CHECK: [[BB16]]: -; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i64> [ [[TMP11]], [[TMP2:%.*]] ], [ zeroinitializer, %[[TMP25:.*]] ] -; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i64> [ [[TMP13]], [[TMP2]] ], [ [[TMP29:%.*]], %[[TMP25]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i64> [ [[TMP11]], [[TMP2:%.*]] ], [ zeroinitializer, %[[_LOOPEXIT206:.*]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i64> [ [[TMP13]], [[TMP2]] ], [ [[TMP29:%.*]], %[[_LOOPEXIT206]] ] ; CHECK-NEXT: switch i32 0, label %[[BB19:.*]] [ -; CHECK-NEXT: i32 0, label %[[TMP25]] +; CHECK-NEXT: i32 0, label %[[_LOOPEXIT206]] ; CHECK-NEXT: ] ; CHECK: [[BB19]]: -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i64> 
[[TMP18]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 0, i32 1 ; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 0, i32 2 ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i64> [[TMP22]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP14]], <2 x i64> [[TMP18]], <2 x i32> -; CHECK-NEXT: br label %[[TMP25]] -; CHECK: [[TMP25]]: +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP14]], <2 x i64> [[TMP18]], <2 x i32> +; CHECK-NEXT: br label %[[_LOOPEXIT206]] +; CHECK: [[_LOOPEXIT206]]: ; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i64> [ [[TMP17]], %[[BB19]] ], [ zeroinitializer, %[[BB16]] ] ; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i64> [ [[TMP23]], %[[BB19]] ], [ zeroinitializer, %[[BB16]] ] ; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x i64> [ [[TMP24]], %[[BB19]] ], [ [[TMP15]], %[[BB16]] ] -; CHECK-NEXT: [[TMP29]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> , <2 x i32> +; CHECK-NEXT: [[TMP29]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> , <2 x i32> ; CHECK-NEXT: br i1 false, label %[[DOTLOOPEXIT206:.*]], label %[[BB16]] -; CHECK: [[_LOOPEXIT206:.*:]] +; CHECK: [[_LOOPEXIT207:.*:]] ; CHECK-NEXT: switch i32 0, label %[[BB32:.*]] [ ; CHECK-NEXT: i32 0, [[DOTCONT174:label %.*]] ; CHECK-NEXT: i32 1, label %[[BB30:.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll index 972a58cecc822..ed0782fb5b84d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll @@ -6,31 +6,33 @@ define void @test(ptr %p) { ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[ARRAYIDX7_US_I_841:%.*]] = getelementptr i8, ptr [[P]], i64 36 +; CHECK-NEXT: [[ARRAYIDX7_US_I_1051:%.*]] = getelementptr i8, ptr [[P]], i64 44 ; CHECK-NEXT: [[ARRAYIDX7_US_I_1261:%.*]] = getelementptr i8, ptr [[P]], i64 52 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_1261]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX7_US_I_1051]], i64 -44, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> , <16 x i32> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_841]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <12 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <12 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> [[TMP20]], <12 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> , <16 x i32> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 
6 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> , <16 x i32> [[TMP9]], <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP3]], [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = srem <16 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = or <12 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = srem <12 x i32> [[TMP15]], +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_841]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = srem <4 x i32> [[TMP16]], splat (i32 1) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = srem <8 x i32> [[TMP21]], ; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US_I:.*]] ; CHECK: [[FOR_COND1_PREHEADER_US_I]]: ; CHECK-NEXT: [[A_PROMOTED253537_US_I:%.*]] = phi i32 [ [[OP_RDX8:%.*]], %[[FOR_COND1_PREHEADER_US_I]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]]) -; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v12i32(<12 x i32> [[TMP16]]) +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[RDX_OP:%.*]] = add <4 x i32> [[TMP22]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[RDX_OP]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> [[TMP23]], <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP24]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[OP_RDX8]] = add i32 [[OP_RDX]], 0 ; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US_I]] From 25ba2a7ccb5d6cbd8ac8bd04945e325eb219c6fb Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Thu, 2 Oct 2025 10:05:23 -0700 Subject: [PATCH 4/9] rename VecSz to Sz to make diff look better. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 19f4f06e413ec..722de48e81e8a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2242,7 +2242,7 @@ class BoUpSLP { /// may not be necessary. bool isLoadCombineCandidate(ArrayRef Stores) const; bool isStridedLoad(ArrayRef PointerOps, Type *ScalarTy, - Align Alignment, int64_t Diff, size_t VecSz) const; + Align Alignment, int64_t Diff, size_t Sz) const; /// Given a set of pointers, check if they can be rearranged as follows (%s is /// a constant): /// %b + 0 * %s + 0 @@ -6845,7 +6845,7 @@ isMaskedLoadCompress(ArrayRef VL, ArrayRef PointerOps, /// current graph (for masked gathers extra extractelement instructions /// might be required). bool BoUpSLP::isStridedLoad(ArrayRef PointerOps, Type *ScalarTy, - Align Alignment, int64_t Diff, size_t VecSz) const { + Align Alignment, int64_t Diff, size_t Sz) const { // Try to generate strided load node. 
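 // A sketch of the gating logic below: we form a strided load if some pointer is also used outside the vectorizable graph (rebuilding it from a gather may require extra extractelement instructions), or if the accesses span more than Sz elements and either Sz > MinProfitableStridedLoads or the implied stride is a bounded power of two, or if the accesses are contiguous but reversed (Diff == -(Sz - 1)).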
auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) { return isa<Instruction>(V) && any_of(V->users(), [&](User *U) { @@ -6854,15 +6854,15 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, }); const uint64_t AbsoluteDiff = std::abs(Diff); - auto *VecTy = getWidenedType(ScalarTy, VecSz); + auto *VecTy = getWidenedType(ScalarTy, Sz); if (IsAnyPointerUsedOutGraph || - (AbsoluteDiff > VecSz && - (VecSz > MinProfitableStridedLoads || - (AbsoluteDiff <= MaxProfitableLoadStride * VecSz && - AbsoluteDiff % VecSz == 0 && has_single_bit(AbsoluteDiff / VecSz)))) || - Diff == -(static_cast<int64_t>(VecSz) - 1)) { - int64_t Stride = Diff / static_cast<int64_t>(VecSz - 1); - if (Diff != Stride * static_cast<int64_t>(VecSz - 1)) + (AbsoluteDiff > Sz && + (Sz > MinProfitableStridedLoads || + (AbsoluteDiff <= MaxProfitableLoadStride * Sz && + AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) || + Diff == -(static_cast<int64_t>(Sz) - 1)) { + int64_t Stride = Diff / static_cast<int64_t>(Sz - 1); + if (Diff != Stride * static_cast<int64_t>(Sz - 1)) return false; if (!TTI->isLegalStridedLoadStore(VecTy, Alignment)) return false; From 642477de25f5c54a7bef313923a1156ea537a027 Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Thu, 2 Oct 2025 10:39:33 -0700 Subject: [PATCH 5/9] Update test. --- .../Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll index 968e61fb5f9e0..12725e7d46273 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -mtriple=riscv64 -mattr=+m,+v -passes=slp-vectorizer -S < %s | FileCheck %s +; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem -passes=slp-vectorizer -S < %s | FileCheck %s define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) { ; CHECK-LABEL: define void @const_stride_1_no_reordering( @@ -622,9 +622,9 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 [[GEP_L0]], i64 100, <4 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> -; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[GEP_L0]], i64 100, <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +; CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 From 585eb0820d7e57599f9a1f885d62bef8f4bcd52f Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Fri, 3 Oct 2025 08:22:33 -0700 Subject: [PATCH 6/9] [SLPVectorizer] Move size checks (NFC). Add the `analyzeRtStrideCandidate` function. In future commits we will add the capability to widen strided loads to it.
So, in this commit, we move the size / type checks into it, since it can change the size / type of the load. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 722de48e81e8a..f4c30357ff70f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2266,6 +2266,11 @@ class BoUpSLP { int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const; + bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy, + Align CommonAlignment, + SmallVectorImpl<unsigned> &SortedIndices, + StridedPtrInfo &SPtrInfo) const; + /// Checks if the given array of loads can be represented as a vectorized, /// scatter or just simple gather. /// \param VL list of loads. @@ -6959,6 +6964,27 @@ bool BoUpSLP::analyzeConstantStrideCandidate( return true; } +bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, + Type *ScalarTy, Align CommonAlignment, + SmallVectorImpl<unsigned> &SortedIndices, + StridedPtrInfo &SPtrInfo) const { + return true; + const unsigned Sz = PointerOps.size(); + // TODO: VecSz may change if we widen the strided load. + unsigned VecSz = Sz; + FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz); + if (!(Sz > MinProfitableStridedLoads && TTI->isTypeLegal(StridedLoadTy) && + TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))) + return false; + if (const SCEV *Stride = + calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) { + SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size()); + SPtrInfo.StrideSCEV = Stride; + return true; + } + return false; +} + BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order, SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo, @@ -6999,15 +7025,9 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( auto *VecTy = getWidenedType(ScalarTy, Sz); Align CommonAlignment = computeCommonAlignment<LoadInst>(VL); if (!IsSorted) { - if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) { - if (const SCEV *Stride = - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order); - Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) { - SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size()); - SPtrInfo.StrideSCEV = Stride; - return LoadsState::StridedVectorize; - } - } + if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order, + SPtrInfo)) + return LoadsState::StridedVectorize; if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) From fff535af71e26ff32765d0bdc3b9808e3716f6ce Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Fri, 3 Oct 2025 10:11:06 -0700 Subject: [PATCH 7/9] Removed wrong `return true`. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f4c30357ff70f..79cad4c33b54c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6968,7 +6968,6 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo) const { - return true; const unsigned Sz = PointerOps.size(); // TODO: VecSz may change if we widen the strided load.
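 // Note on the TODO above: if the strided load is widened, VecSz shrinks to Sz / N while ScalarTy grows to an integer type N times as wide, so the StridedLoadTy computed below changes as well.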
unsigned VecSz = Sz; From b9c6119bd6b8a94b44f5bc148d036be1df56a67f Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Fri, 3 Oct 2025 11:50:43 -0700 Subject: [PATCH 8/9] [SLPVectorizer] Widen run-time-strided loads. Suppose we are given pointers of the form `%b + x * %s + y * %c_i`, where the `%c_i`s are constants and `%s` is a value that is fixed but known only at run time. If the pointers can be rearranged as follows: ``` %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2 ... %b + 0 * %s + w %b + 1 * %s + 0 %b + 1 * %s + 1 %b + 1 * %s + 2 ... %b + 1 * %s + w ... ``` then the memory can be accessed with strided loads of width `w` and stride `%s`. This is motivated by the x264 benchmark. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 126 ++++++++++++++++-- .../RISCV/basic-strided-loads.ll | 80 ++++++++++- 2 files changed, 192 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 79cad4c33b54c..0b61e59e10546 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2266,7 +2266,7 @@ class BoUpSLP { int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const; - bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy, + bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment, SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo) const; @@ -6403,7 +6403,8 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) { /// Otherwise, SCEV* of the stride value is returned. static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl<unsigned> &SortedIndices) { + SmallVectorImpl<unsigned> &SortedIndices, + SmallVectorImpl<int64_t> &Coeffs) { SmallVector<const SCEV *> SCEVs; const SCEV *PtrSCEVLowest = nullptr; const SCEV *PtrSCEVHighest = nullptr; @@ -6478,12 +6479,14 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, const auto *SC = dyn_cast<SCEVConstant>(Coeff); if (!SC || isa<SCEVCouldNotCompute>(SC)) return nullptr; + Coeffs.push_back((int64_t)SC->getAPInt().getLimitedValue()); if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest, SE.getMulExpr(Stride, SC))) ->isZero()) return nullptr; Dist = SC->getAPInt().getZExtValue(); - } + } else + Coeffs.push_back(0); // If the strides are not the same or repeated, we can't vectorize. if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size()) return nullptr; @@ -6965,23 +6968,124 @@ bool BoUpSLP::analyzeConstantStrideCandidate( } bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, - Type *ScalarTy, Align CommonAlignment, + Type *ElemTy, Align CommonAlignment, SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo) const { + // Group the pointers by constant offset. + SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>> + OffsetToPointerOpIdxMap; + for (auto [Idx, Ptr] : enumerate(PointerOps)) { + const SCEV *PtrSCEV = SE->getSCEV(Ptr); + if (!PtrSCEV) + return false; + + const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV); + int64_t Offset = 0; + if (Add) { + for (int I : seq(Add->getNumOperands())) { + const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I)); + if (!SC) + continue; + Offset = SC->getAPInt().getSExtValue(); + break; + } + } + OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr); + OffsetToPointerOpIdxMap[Offset].second.push_back(Idx); + } + int NumOffsets = OffsetToPointerOpIdxMap.size(); + const unsigned Sz = PointerOps.size(); - // TODO: VecSz may change if we widen the strided load.
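 + // For example: eight i8 pointers of the form %b + i * %s + j, with i in + // [0, 3] and j in {0, 1}, fall into NumOffsets = 2 offset groups of four + // pointers each. Sz = 8 is divisible by NumOffsets, so VecSz becomes + // 8 / 2 = 4 and ScalarTy becomes i16, i.e. the whole set can be loaded as + // four i16 lanes with run-time stride %s instead of a gather of eight i8s.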
unsigned VecSz = Sz; + Type *ScalarTy = ElemTy; + if (NumOffsets > 1) { + if (Sz % NumOffsets != 0) + return false; + VecSz = Sz / NumOffsets; + ScalarTy = Type::getIntNTy(SE->getContext(), + DL->getTypeSizeInBits(ElemTy).getFixedValue() * + NumOffsets); + } FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz); if (!(Sz > MinProfitableStridedLoads && TTI->isTypeLegal(StridedLoadTy) && TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))) return false; - if (const SCEV *Stride = - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) { - SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size()); - SPtrInfo.StrideSCEV = Stride; - return true; + + SmallVector SortedOffsetsV; + for (auto [K, _] : OffsetToPointerOpIdxMap) + SortedOffsetsV.push_back(K); + sort(SortedOffsetsV); + if (NumOffsets > 1) { + int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0]; + if (CommonDiff != 1) + return false; + for (int I : seq(1, SortedOffsetsV.size() - 1)) { + if (SortedOffsetsV[I + 1] - SortedOffsetsV[I] != CommonDiff) + return false; + } } - return false; + + int64_t LowestOffset = SortedOffsetsV[0]; + SmallVector &PointerOps0 = + OffsetToPointerOpIdxMap[LowestOffset].first; + SmallVector &IndicesInAllPointerOps0 = + OffsetToPointerOpIdxMap[LowestOffset].second; + + SmallVector Coeffs0; + SmallVector SortedIndicesForOffset0; + const SCEV *Stride0 = calculateRtStride(PointerOps0, ElemTy, *DL, *SE, + SortedIndicesForOffset0, Coeffs0); + if (!Stride0) + return false; + unsigned NumCoeffs0 = Coeffs0.size(); + if (NumCoeffs0 * NumOffsets != Sz) + return false; + sort(Coeffs0); + + SmallVector SortedIndicesDraft; + SortedIndicesDraft.resize(Sz); + auto UpdateSortedIndices = + [&](SmallVectorImpl &SortedIndicesForOffset, + SmallVectorImpl &IndicesInAllPointerOps, + int64_t OffsetNum) { + for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) { + SortedIndicesDraft[Num * NumOffsets + OffsetNum] = + IndicesInAllPointerOps[Idx]; + } + }; + + UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0); + + SmallVector Coeffs; + SmallVector SortedIndicesForOffset; + for (int I : seq(1, NumOffsets)) { + Coeffs.clear(); + SortedIndicesForOffset.clear(); + + int64_t Offset = SortedOffsetsV[I]; + SmallVector &PointerOpsForOffset = + OffsetToPointerOpIdxMap[Offset].first; + SmallVector &IndicesInAllPointerOps = + OffsetToPointerOpIdxMap[Offset].second; + const SCEV *StrideWithinGroup = calculateRtStride( + PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs); + + if (!StrideWithinGroup || StrideWithinGroup != Stride0) + return false; + if (Coeffs.size() != NumCoeffs0) + return false; + sort(Coeffs); + if (Coeffs != Coeffs0) + return false; + + UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I); + } + + SortedIndices.clear(); + SortedIndices = SortedIndicesDraft; + SPtrInfo.StrideSCEV = Stride0; + SPtrInfo.Ty = StridedLoadTy; + return true; } BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll index 12725e7d46273..8f1561dec8ff4 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll @@ -332,11 +332,85 @@ define void @rt_stride_1_no_reordering(ptr %pl, i64 %stride, ptr %ps) { ; CHECK-LABEL: define void @rt_stride_1_no_reordering( ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) 
#[[ATTR0]] { ; CHECK-NEXT: [[STRIDE0:%.*]] = mul nsw i64 [[STRIDE]], 0 +; CHECK-NEXT: [[STRIDE1:%.*]] = mul nsw i64 [[STRIDE]], 1 +; CHECK-NEXT: [[STRIDE2:%.*]] = mul nsw i64 [[STRIDE]], 2 +; CHECK-NEXT: [[STRIDE3:%.*]] = mul nsw i64 [[STRIDE]], 3 +; CHECK-NEXT: [[STRIDE4:%.*]] = mul nsw i64 [[STRIDE]], 4 +; CHECK-NEXT: [[STRIDE5:%.*]] = mul nsw i64 [[STRIDE]], 5 +; CHECK-NEXT: [[STRIDE6:%.*]] = mul nsw i64 [[STRIDE]], 6 +; CHECK-NEXT: [[STRIDE7:%.*]] = mul nsw i64 [[STRIDE]], 7 +; CHECK-NEXT: [[STRIDE8:%.*]] = mul nsw i64 [[STRIDE]], 8 +; CHECK-NEXT: [[STRIDE9:%.*]] = mul nsw i64 [[STRIDE]], 9 +; CHECK-NEXT: [[STRIDE10:%.*]] = mul nsw i64 [[STRIDE]], 10 +; CHECK-NEXT: [[STRIDE11:%.*]] = mul nsw i64 [[STRIDE]], 11 +; CHECK-NEXT: [[STRIDE12:%.*]] = mul nsw i64 [[STRIDE]], 12 +; CHECK-NEXT: [[STRIDE13:%.*]] = mul nsw i64 [[STRIDE]], 13 +; CHECK-NEXT: [[STRIDE14:%.*]] = mul nsw i64 [[STRIDE]], 14 +; CHECK-NEXT: [[STRIDE15:%.*]] = mul nsw i64 [[STRIDE]], 15 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE0]] +; CHECK-NEXT: [[GEP_L1:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE1]] +; CHECK-NEXT: [[GEP_L2:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE2]] +; CHECK-NEXT: [[GEP_L3:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE3]] +; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE4]] +; CHECK-NEXT: [[GEP_L5:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE5]] +; CHECK-NEXT: [[GEP_L6:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE6]] +; CHECK-NEXT: [[GEP_L7:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE7]] +; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE8]] +; CHECK-NEXT: [[GEP_L9:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE9]] +; CHECK-NEXT: [[GEP_L10:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE10]] +; CHECK-NEXT: [[GEP_L11:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE11]] +; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE12]] +; CHECK-NEXT: [[GEP_L13:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE13]] +; CHECK-NEXT: [[GEP_L14:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE14]] +; CHECK-NEXT: [[GEP_L15:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE15]] +; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP_L0]], align 1 +; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP_L1]], align 1 +; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP_L2]], align 1 +; CHECK-NEXT: [[LOAD3:%.*]] = load i8, ptr [[GEP_L3]], align 1 +; CHECK-NEXT: [[LOAD4:%.*]] = load i8, ptr [[GEP_L4]], align 1 +; CHECK-NEXT: [[LOAD5:%.*]] = load i8, ptr [[GEP_L5]], align 1 +; CHECK-NEXT: [[LOAD6:%.*]] = load i8, ptr [[GEP_L6]], align 1 +; CHECK-NEXT: [[LOAD7:%.*]] = load i8, ptr [[GEP_L7]], align 1 +; CHECK-NEXT: [[LOAD8:%.*]] = load i8, ptr [[GEP_L8]], align 1 +; CHECK-NEXT: [[LOAD9:%.*]] = load i8, ptr [[GEP_L9]], align 1 +; CHECK-NEXT: [[LOAD10:%.*]] = load i8, ptr [[GEP_L10]], align 1 +; CHECK-NEXT: [[LOAD11:%.*]] = load i8, ptr [[GEP_L11]], align 1 +; CHECK-NEXT: [[LOAD12:%.*]] = load i8, ptr [[GEP_L12]], align 1 +; CHECK-NEXT: [[LOAD13:%.*]] = load i8, ptr [[GEP_L13]], align 1 +; CHECK-NEXT: [[LOAD14:%.*]] = load i8, ptr [[GEP_L14]], align 1 +; CHECK-NEXT: [[LOAD15:%.*]] = load i8, ptr [[GEP_L15]], align 1 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 1 -; CHECK-NEXT: 
[[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 1 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16) -; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1 +; CHECK-NEXT: [[GEP_S1:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 1 +; CHECK-NEXT: [[GEP_S2:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 2 +; CHECK-NEXT: [[GEP_S3:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 3 +; CHECK-NEXT: [[GEP_S4:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 4 +; CHECK-NEXT: [[GEP_S5:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 5 +; CHECK-NEXT: [[GEP_S6:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 6 +; CHECK-NEXT: [[GEP_S7:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 7 +; CHECK-NEXT: [[GEP_S8:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 8 +; CHECK-NEXT: [[GEP_S9:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 9 +; CHECK-NEXT: [[GEP_S10:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 10 +; CHECK-NEXT: [[GEP_S11:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 11 +; CHECK-NEXT: [[GEP_S12:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 12 +; CHECK-NEXT: [[GEP_S13:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 13 +; CHECK-NEXT: [[GEP_S14:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 14 +; CHECK-NEXT: [[GEP_S15:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 15 +; CHECK-NEXT: store i8 [[LOAD0]], ptr [[GEP_S0]], align 1 +; CHECK-NEXT: store i8 [[LOAD1]], ptr [[GEP_S1]], align 1 +; CHECK-NEXT: store i8 [[LOAD2]], ptr [[GEP_S2]], align 1 +; CHECK-NEXT: store i8 [[LOAD3]], ptr [[GEP_S3]], align 1 +; CHECK-NEXT: store i8 [[LOAD4]], ptr [[GEP_S4]], align 1 +; CHECK-NEXT: store i8 [[LOAD5]], ptr [[GEP_S5]], align 1 +; CHECK-NEXT: store i8 [[LOAD6]], ptr [[GEP_S6]], align 1 +; CHECK-NEXT: store i8 [[LOAD7]], ptr [[GEP_S7]], align 1 +; CHECK-NEXT: store i8 [[LOAD8]], ptr [[GEP_S8]], align 1 +; CHECK-NEXT: store i8 [[LOAD9]], ptr [[GEP_S9]], align 1 +; CHECK-NEXT: store i8 [[LOAD10]], ptr [[GEP_S10]], align 1 +; CHECK-NEXT: store i8 [[LOAD11]], ptr [[GEP_S11]], align 1 +; CHECK-NEXT: store i8 [[LOAD12]], ptr [[GEP_S12]], align 1 +; CHECK-NEXT: store i8 [[LOAD13]], ptr [[GEP_S13]], align 1 +; CHECK-NEXT: store i8 [[LOAD14]], ptr [[GEP_S14]], align 1 +; CHECK-NEXT: store i8 [[LOAD15]], ptr [[GEP_S15]], align 1 ; CHECK-NEXT: ret void ; %stride0 = mul nsw i64 %stride, 0 From 06c09829792a617c791b3f2618711451e79eaa5b Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Wed, 8 Oct 2025 12:43:56 -0700 Subject: [PATCH 9/9] Fixed a bug, added some comments, added `const` in `UpdateSortedIndices`, and updated tests. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 19 +++- .../RISCV/basic-strided-loads.ll | 100 ++---------------- .../SLPVectorizer/RISCV/x264-satd-8x4.ll | 68 +++--------- 3 files changed, 34 insertions(+), 153 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0b61e59e10546..d35893c3796c2 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6971,7 +6971,9 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment, SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo) const { - // Group the pointers by constant offset. + // If each value in `PointerOps` is of the form `%x + Offset` where `Offset` + // is constant, then for each offset we record the values from `PointerOps` + // and their indices in `PointerOps`.
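+ // For example (illustrative, not part of the original patch): for + // PointerOps = {%b, %b + 1, %b + %s, %b + %s + 1} the map below becomes + // 0 -> ({%b, %b + %s}, {0, 2}) and 1 -> ({%b + 1, %b + %s + 1}, {1, 3}).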
SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>> OffsetToPointerOpIdxMap; for (auto [Idx, Ptr] : enumerate(PointerOps)) { @@ -7011,6 +7013,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))) return false; + // Check if the offsets are contiguous. SmallVector<int64_t> SortedOffsetsV; for (auto [K, _] : OffsetToPointerOpIdxMap) SortedOffsetsV.push_back(K); @@ -7025,6 +7028,11 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, } } + // For each set of pointers with the same offset, check that the distances + // between adjacent pointers are all equal to the same value (the stride). + // As we do that, also calculate SortedIndices. Since we should not modify + // `SortedIndices` unless we know that all the checks succeeded, record the + // indices into `SortedIndicesDraft`. int64_t LowestOffset = SortedOffsetsV[0]; SmallVector<Value *> &PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first; @@ -7046,8 +7054,13 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, SortedIndicesDraft.resize(Sz); auto UpdateSortedIndices = [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset, - SmallVectorImpl<unsigned> &IndicesInAllPointerOps, - int64_t OffsetNum) { + const SmallVectorImpl<unsigned> &IndicesInAllPointerOps, + const int64_t OffsetNum) { + if (SortedIndicesForOffset.empty()) { + SortedIndicesForOffset.resize(IndicesInAllPointerOps.size()); + std::iota(SortedIndicesForOffset.begin(), + SortedIndicesForOffset.end(), 0); + } for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) { SortedIndicesDraft[Num * NumOffsets + OffsetNum] = IndicesInAllPointerOps[Idx]; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll index 8f1561dec8ff4..5b57f2a6b2879 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll @@ -332,85 +332,11 @@ define void @rt_stride_1_no_reordering(ptr %pl, i64 %stride, ptr %ps) { ; CHECK-LABEL: define void @rt_stride_1_no_reordering( ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[STRIDE0:%.*]] = mul nsw i64 [[STRIDE]], 0 -; CHECK-NEXT: [[STRIDE1:%.*]] = mul nsw i64 [[STRIDE]], 1 -; CHECK-NEXT: [[STRIDE2:%.*]] = mul nsw i64 [[STRIDE]], 2 -; CHECK-NEXT: [[STRIDE3:%.*]] = mul nsw i64 [[STRIDE]], 3 -; CHECK-NEXT: [[STRIDE4:%.*]] = mul nsw i64 [[STRIDE]], 4 -; CHECK-NEXT: [[STRIDE5:%.*]] = mul nsw i64 [[STRIDE]], 5 -; CHECK-NEXT: [[STRIDE6:%.*]] = mul nsw i64 [[STRIDE]], 6 -; CHECK-NEXT: [[STRIDE7:%.*]] = mul nsw i64 [[STRIDE]], 7 -; CHECK-NEXT: [[STRIDE8:%.*]] = mul nsw i64 [[STRIDE]], 8 -; CHECK-NEXT: [[STRIDE9:%.*]] = mul nsw i64 [[STRIDE]], 9 -; CHECK-NEXT: [[STRIDE10:%.*]] = mul nsw i64 [[STRIDE]], 10 -; CHECK-NEXT: [[STRIDE11:%.*]] = mul nsw i64 [[STRIDE]], 11 -; CHECK-NEXT: [[STRIDE12:%.*]] = mul nsw i64 [[STRIDE]], 12 -; CHECK-NEXT: [[STRIDE13:%.*]] = mul nsw i64 [[STRIDE]], 13 -; CHECK-NEXT: [[STRIDE14:%.*]] = mul nsw i64 [[STRIDE]], 14 -; CHECK-NEXT: [[STRIDE15:%.*]] = mul nsw i64 [[STRIDE]], 15 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE0]] -; CHECK-NEXT: [[GEP_L1:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE1]] -; CHECK-NEXT: [[GEP_L2:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE2]] -; CHECK-NEXT: [[GEP_L3:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE3]] -; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE4]]
-; CHECK-NEXT: [[GEP_L5:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE5]] -; CHECK-NEXT: [[GEP_L6:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE6]] -; CHECK-NEXT: [[GEP_L7:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE7]] -; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE8]] -; CHECK-NEXT: [[GEP_L9:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE9]] -; CHECK-NEXT: [[GEP_L10:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE10]] -; CHECK-NEXT: [[GEP_L11:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE11]] -; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE12]] -; CHECK-NEXT: [[GEP_L13:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE13]] -; CHECK-NEXT: [[GEP_L14:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE14]] -; CHECK-NEXT: [[GEP_L15:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE15]] -; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP_L0]], align 1 -; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP_L1]], align 1 -; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP_L2]], align 1 -; CHECK-NEXT: [[LOAD3:%.*]] = load i8, ptr [[GEP_L3]], align 1 -; CHECK-NEXT: [[LOAD4:%.*]] = load i8, ptr [[GEP_L4]], align 1 -; CHECK-NEXT: [[LOAD5:%.*]] = load i8, ptr [[GEP_L5]], align 1 -; CHECK-NEXT: [[LOAD6:%.*]] = load i8, ptr [[GEP_L6]], align 1 -; CHECK-NEXT: [[LOAD7:%.*]] = load i8, ptr [[GEP_L7]], align 1 -; CHECK-NEXT: [[LOAD8:%.*]] = load i8, ptr [[GEP_L8]], align 1 -; CHECK-NEXT: [[LOAD9:%.*]] = load i8, ptr [[GEP_L9]], align 1 -; CHECK-NEXT: [[LOAD10:%.*]] = load i8, ptr [[GEP_L10]], align 1 -; CHECK-NEXT: [[LOAD11:%.*]] = load i8, ptr [[GEP_L11]], align 1 -; CHECK-NEXT: [[LOAD12:%.*]] = load i8, ptr [[GEP_L12]], align 1 -; CHECK-NEXT: [[LOAD13:%.*]] = load i8, ptr [[GEP_L13]], align 1 -; CHECK-NEXT: [[LOAD14:%.*]] = load i8, ptr [[GEP_L14]], align 1 -; CHECK-NEXT: [[LOAD15:%.*]] = load i8, ptr [[GEP_L15]], align 1 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[GEP_S1:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 1 -; CHECK-NEXT: [[GEP_S2:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 2 -; CHECK-NEXT: [[GEP_S3:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 3 -; CHECK-NEXT: [[GEP_S4:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 4 -; CHECK-NEXT: [[GEP_S5:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 5 -; CHECK-NEXT: [[GEP_S6:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 6 -; CHECK-NEXT: [[GEP_S7:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 7 -; CHECK-NEXT: [[GEP_S8:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 8 -; CHECK-NEXT: [[GEP_S9:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 9 -; CHECK-NEXT: [[GEP_S10:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 10 -; CHECK-NEXT: [[GEP_S11:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 11 -; CHECK-NEXT: [[GEP_S12:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 12 -; CHECK-NEXT: [[GEP_S13:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 13 -; CHECK-NEXT: [[GEP_S14:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 14 -; CHECK-NEXT: [[GEP_S15:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 15 -; CHECK-NEXT: store i8 [[LOAD0]], ptr [[GEP_S0]], align 1 -; CHECK-NEXT: store i8 [[LOAD1]], ptr [[GEP_S1]], align 1 -; CHECK-NEXT: store i8 [[LOAD2]], ptr [[GEP_S2]], align 1 -; CHECK-NEXT: store i8 [[LOAD3]], ptr [[GEP_S3]], align 1 -; CHECK-NEXT: store i8 [[LOAD4]], ptr [[GEP_S4]], align 1 -; 
CHECK-NEXT: store i8 [[LOAD5]], ptr [[GEP_S5]], align 1 -; CHECK-NEXT: store i8 [[LOAD6]], ptr [[GEP_S6]], align 1 -; CHECK-NEXT: store i8 [[LOAD7]], ptr [[GEP_S7]], align 1 -; CHECK-NEXT: store i8 [[LOAD8]], ptr [[GEP_S8]], align 1 -; CHECK-NEXT: store i8 [[LOAD9]], ptr [[GEP_S9]], align 1 -; CHECK-NEXT: store i8 [[LOAD10]], ptr [[GEP_S10]], align 1 -; CHECK-NEXT: store i8 [[LOAD11]], ptr [[GEP_S11]], align 1 -; CHECK-NEXT: store i8 [[LOAD12]], ptr [[GEP_S12]], align 1 -; CHECK-NEXT: store i8 [[LOAD13]], ptr [[GEP_S13]], align 1 -; CHECK-NEXT: store i8 [[LOAD14]], ptr [[GEP_S14]], align 1 -; CHECK-NEXT: store i8 [[LOAD15]], ptr [[GEP_S15]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 1 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16) +; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %stride0 = mul nsw i64 %stride, 0 @@ -784,25 +710,11 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { ; CHECK-LABEL: define void @rt_stride_widen_no_reordering( ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[OFFSET0:%.*]] = mul nsw i64 [[STRIDE]], 0 -; CHECK-NEXT: [[OFFSET4:%.*]] = mul nsw i64 [[STRIDE]], 1 -; CHECK-NEXT: [[OFFSET8:%.*]] = mul nsw i64 [[STRIDE]], 2 -; CHECK-NEXT: [[OFFSET12:%.*]] = mul nsw i64 [[STRIDE]], 3 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET0]] -; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET4]] -; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET8]] -; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET12]] ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[GEP_L0]], i64 [[TMP1]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> ; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll index c1042f1842107..33249a8e66657 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll @@ -10,66 +10,22 @@ define i32 
@x264_pixel_satd_8x4(ptr %pix1, i32 %i_pix1, ptr %pix2, i32 %i_pix ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[I_PIX2]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 4 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PIX1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[PIX2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_2]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP18]], <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[IDX_EXT]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX1]], i64 [[TMP0]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> ; CHECK-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x 
i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[IDX_EXT63]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX2]], i64 [[TMP4]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> ; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[IDX_EXT]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 [[TMP9]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> ; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[IDX_EXT63]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 [[TMP13]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP47:%.*]] = bitcast <4 x i32> [[TMP14]] to <16 x i8> ; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> ; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]] ; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], splat (i32 16)