diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 64b41bf9cefa8a..688ea8fa234338 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -33,6 +34,7 @@ using namespace llvm;
 using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "vector-combine"
+STATISTIC(NumVecLoad, "Number of vector loads formed");
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
 STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
@@ -65,6 +67,7 @@ class VectorCombine {
   const TargetTransformInfo &TTI;
   const DominatorTree &DT;
 
+  bool vectorizeLoadInsert(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -88,6 +91,61 @@ static void replaceValue(Value &Old, Value &New) {
   New.takeName(&Old);
 }
 
+bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
+  // Match insert of scalar load.
+  Value *Scalar;
+  if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
+    return false;
+  auto *Load = dyn_cast<LoadInst>(Scalar);
+  Type *ScalarTy = Scalar->getType();
+  if (!Load || !Load->isSimple())
+    return false;
+
+  // TODO: Extend this to match GEP with constant offsets.
+  Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
+  assert(isa<PointerType>(PtrOp->getType()) && "Expected a pointer type");
+
+  unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
+  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
+  if (!ScalarSize || VectorSize % ScalarSize != 0)
+    return false;
+
+  // Check safety of replacing the scalar load with a larger vector load.
+  unsigned VecNumElts = VectorSize / ScalarSize;
+  auto *VectorTy = VectorType::get(ScalarTy, VecNumElts, false);
+  // TODO: Allow insert/extract subvector if the type does not match.
+  if (VectorTy != I.getType())
+    return false;
+  Align Alignment = Load->getAlign();
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
+    return false;
+
+  // Original pattern: insertelt undef, load [free casts of] ScalarPtr, 0
+  int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                    Load->getPointerAddressSpace());
+  APInt DemandedElts = APInt::getOneBitSet(VecNumElts, 0);
+  OldCost += TTI.getScalarizationOverhead(VectorTy, DemandedElts, true, false);
+
+  // New pattern: load VecPtr
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment,
+                                    Load->getPointerAddressSpace());
+
+  // We can aggressively convert to the vector form because the backend can
+  // invert this transform if it does not result in a performance win.
+  if (OldCost < NewCost)
+    return false;
+
+  // It is safe and potentially profitable to load a vector directly:
+  // inselt undef, load Scalar, 0 --> load VecPtr
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo());
+  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  replaceValue(I, *VecLd);
+  ++NumVecLoad;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -625,6 +683,7 @@ bool VectorCombine::run() {
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       Builder.SetInsertPoint(&I);
+      MadeChange |= vectorizeLoadInsert(I);
       MadeChange |= foldExtractExtract(I);
      MadeChange |= foldBitcastShuf(I);
      MadeChange |= scalarizeBinopOrCmp(I);
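For reference, here is a minimal standalone illustration of the rewrite (not part of the patch; the function name @example is made up, and the attributes mirror the load_f32_insert_v4f32 test updated below). Given a scalar load whose value is inserted into lane 0 of an undef vector, with a pointer that is aligned and dereferenceable for the whole vector, the vector-combine pass is expected to emit a single vector load:

; Before: scalar load feeding an insertelement into undef at index 0.
define <4 x float> @example(float* align 16 dereferenceable(16) %p) {
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Expected after the pass (per the CHECK lines in the test diff below):
;   %1 = bitcast float* %p to <4 x float>*
;   %r = load <4 x float>, <4 x float>* %1, align 4
;   ret <4 x float> %r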
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index 6f229044cf568a..edd1e4af099edf 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -174,8 +174,8 @@ define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable
 
 define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32(
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -185,9 +185,7 @@ define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p
 
 define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_f32_insert_v4f32(
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x float>* [[P:%.*]] to float*
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[B]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %b = bitcast <4 x float>* %p to float*
@@ -196,10 +194,12 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenc
   ret <4 x float> %r
 }
 
+; Element type does not change cost.
+
 define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v4i32(
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -207,11 +207,12 @@ define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
   ret <4 x i32> %r
 }
 
+; Pointer type does not change cost.
+
 define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v4i32(
-; CHECK-NEXT:    [[B:%.*]] = bitcast <16 x i8>* [[P:%.*]] to i32*
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %b = bitcast <16 x i8>* %p to i32*
@@ -220,11 +221,11 @@ define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceabl
   ret <4 x i32> %r
 }
 
+; This is canonical form for vector element access.
+
 define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[GEP]], align 16
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
@@ -233,11 +234,13 @@ define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenc
   ret <4 x float> %r
 }
 
+; If there are enough dereferenceable bytes, we can offset the vector load.
+
 define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
+; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
@@ -246,6 +249,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
   ret <8 x i16> %r
 }
 
+; Negative test - can't safely load the offset vector, but could load+shuffle.
+
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
@@ -259,11 +264,13 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 derefere
   ret <8 x i16> %r
 }
 
+; If there are enough dereferenceable bytes, we can offset the vector load.
+
 define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) {
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
+; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
@@ -272,6 +279,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
   ret <8 x i16> %r
 }
 
+; Negative test - can't safely load the offset vector, but could load+shuffle.
+
 define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) {
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
@@ -285,6 +294,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 derefere
   ret <8 x i16> %r
 }
 
+; Negative test - do not alter volatile.
+
 define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
 ; CHECK-NEXT:    [[S:%.*]] = load volatile float, float* [[P:%.*]], align 4
@@ -296,6 +307,8 @@ define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceab
   ret <4 x float> %r
 }
 
+; Negative test? - pointer is not as aligned as load.
+
 define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32_align(
 ; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
@@ -307,6 +320,8 @@ define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(1
   ret <4 x float> %r
 }
 
+; Negative test - not enough bytes.
+
 define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(15) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32_deref(
 ; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
@@ -318,6 +333,8 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(1
   ret <4 x float> %r
 }
 
+; TODO: Should load v4i32.
+
 define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
 ; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
@@ -329,6 +346,8 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
   ret <8 x i32> %r
 }
 
+; TODO: Should load v4i32.
+
 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
 ; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32*