diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 25123ea5b87b7..64a515270fd57 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1473,11 +1473,11 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) { bool UsesSecondVec = any_of(ConcatMask, [&](int M) { return M >= NumInputElts; }); InstructionCost OldCost = TTI.getShuffleCost( - UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, - Shuffle->getShuffleMask()); + UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, + UsesSecondVec ? VecType : ShuffleInputType, Shuffle->getShuffleMask()); InstructionCost NewCost = TTI.getShuffleCost( - UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, - ConcatMask); + UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, + UsesSecondVec ? VecType : ShuffleInputType, ConcatMask); LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle << "\n"); diff --git a/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll b/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll new file mode 100644 index 0000000000000..45ec3f511ebc8 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -S --passes=vector-combine -mtriple=x86_64-unknown-linux < %s | FileCheck %s + +define i16 @test_spill_mixed() { +; CHECK-LABEL: define i16 @test_spill_mixed() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]]) +; CHECK-NEXT: ret i16 0 +; +entry: + %0 = shufflevector <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <4 x i32> + %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0) + ret i16 0 +} + +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)