diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bf3f52c51b64c..df835a077f2a0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20996,6 +20996,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP,
         return false;
       }))
     return std::nullopt;
   bool HasCopyables = S.areInstructionsWithCopyableElements();
+  // Do not schedule the bundle if it has copyable elements, its user node
+  // (EI.UserTE) has a state with copyable elements too, both main operations
+  // share the same parent block, and every value in VL is either a copyable
+  // element or is used outside the block.
+  // NOTE(review): the patch does not state the rationale; see the regression
+  // test X86/copyable-child-node-used-outside.ll and confirm with the author.
+  if (HasCopyables && EI && EI.UserTE->hasState() &&
+      EI.UserTE->hasCopyableElements() &&
+      EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
+      all_of(VL, [&](Value *V) {
+        return S.isCopyableElement(V) || isUsedOutsideBlock(V);
+      }))
+    return std::nullopt;
   if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
        all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll b/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll
new file mode 100644
index 0000000000000..65975199e46b8
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define <4 x i32> @test() {
+; CHECK-LABEL: define <4 x i32> @test() {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 0 to i32
+; CHECK-NEXT:    br label %[[BB1:.*]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[TRUNC]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[TRUNC]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[OR]] to i64
+; CHECK-NEXT:    br label %[[BB3:.*]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+bb:
+  %trunc = trunc i64 0 to i32
+  br label %bb1
+
+bb1:
+  %or = or i32 %trunc, 0
+  %zext = zext i32 %or to i64
+  %and = and i32 0, 0
+  %or2 = or i32 %trunc, 0
+  br label %bb3
+
+bb3:
+  %0 = insertelement <4 x i32> zeroinitializer, i32 %trunc, i32 0
+  %1 = insertelement <4 x i32> %0, i32 %and, i32 1
+  %2 = insertelement <4 x i32> %1, i32 %or2, i32 2
+  %3 = insertelement <4 x i32> %2, i32 %or, i32 3
+  ret <4 x i32> %3
+}