diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5ff2cd18c73c8b..000bd863a7c54f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3694,11 +3694,13 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                        TargetTransformInfo *TTI) {
   // Look past the root to find a source value. Arbitrarily follow the
   // path through operand 0 of any 'or'. Also, peek through optional
-  // shift-left-by-constant.
+  // shift-left-by-multiple-of-8-bits.
   Value *ZextLoad = Root;
+  const APInt *ShAmtC;
   while (!isa<ConstantExpr>(ZextLoad) &&
          (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
-          match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
+          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
+           ShAmtC->urem(8) == 0)))
     ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
 
   // Check if the input is an extended load of the required or/shift expression.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
index c78bec1b6a20be..e1028cf5527621 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
@@ -545,10 +545,11 @@ define void @PR47450(i16* nocapture readonly %p) {
 ; CHECK-NEXT:    [[X:%.*]] = load i16, i16* [[P:%.*]], align 2
 ; CHECK-NEXT:    [[Z:%.*]] = zext i16 [[X]] to i32
 ; CHECK-NEXT:    [[S:%.*]] = shl nuw nsw i32 [[Z]], 1
-; CHECK-NEXT:    store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 0), align 16
-; CHECK-NEXT:    store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 1), align 4
-; CHECK-NEXT:    store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 2), align 8
-; CHECK-NEXT:    store i32 [[S]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @output, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[S]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[S]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[S]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([8 x i32]* @output to <4 x i32>*), align 16
 ; CHECK-NEXT:    ret void
 ;
   %x = load i16, i16* %p, align 2