diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 753c656007703..0efff743355b4 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -465,6 +465,21 @@ CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
         VPI.getName());
     break;
   }
+  case Intrinsic::vp_load_ff: {
+    // Mask out all but the first lane.
+    Value *NewMask = ConstantInt::getFalse(MaskParam->getType());
+    NewMask = Builder.CreateInsertElement(
+        NewMask, ConstantInt::getTrue(MaskParam->getType()->getScalarType()),
+        (uint64_t)0);
+    NewMask = Builder.CreateAnd(MaskParam, NewMask);
+    Value *MaskedLoad = Builder.CreateMaskedLoad(
+        VPI.getType()->subtypes()[0], PtrParam, AlignOpt.valueOrOne(), NewMask);
+    Value *EVLResult = Builder.getInt32(1);
+    Value *InsertValue = Builder.CreateInsertValue(
+        PoisonValue::get(VPI.getType()), MaskedLoad, 0);
+    NewMemoryInst = Builder.CreateInsertValue(InsertValue, EVLResult, 1);
+    break;
+  }
   }
 
   assert(NewMemoryInst);
@@ -609,6 +624,7 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
   case Intrinsic::vp_store:
   case Intrinsic::vp_gather:
   case Intrinsic::vp_scatter:
+  case Intrinsic::vp_load_ff:
    return expandPredicationInMemoryIntrinsic(Builder, VPI);
  }
 
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll
index 81923642811d9..721dd1bd1acc1 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll
@@ -91,6 +91,61 @@ define void @vpstore_v2i64_allones_mask_vlmax(<2 x i64> %val, ptr %ptr) {
   ret void
 }
 
+define { <2 x i64>, i32 } @vpload_ff_v2i64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_v2i64(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i1> [[TMP2]], <i1 true, i1 false>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP3]], <2 x i64> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT:    ret { <2 x i64>, i32 } [[TMP6]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+  ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vpload_ff_v2i64_vlmax(ptr %ptr, <2 x i1> %m) {
+; CHECK-LABEL: @vpload_ff_v2i64_vlmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i1> [[M:%.*]], <i1 true, i1 false>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP1]], <2 x i64> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP3]], i32 1, 1
+; CHECK-NEXT:    ret { <2 x i64>, i32 } [[TMP4]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 2)
+  ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vpload_ff_v2i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_v2i64_allones_mask(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i1> [[TMP2]], <i1 true, i1 false>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP3]], <2 x i64> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT:    ret { <2 x i64>, i32 } [[TMP6]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> <i1 true, i1 true>, i32 %evl)
+  ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vpload_ff_v2i64_allones_mask_vlmax(ptr %ptr) {
+; CHECK-LABEL: @vpload_ff_v2i64_allones_mask_vlmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP2]], i32 1, 1
+; CHECK-NEXT:    ret { <2 x i64>, i32 } [[TMP3]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> <i1 true, i1 true>, i32 2)
+  ret { <2 x i64>, i32 } %load
+}
+
 ; Scalable vectors
 define <vscale x 1 x i64> @vpload_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: @vpload_nxv1i64(
@@ -196,6 +251,69 @@ define void @vpstore_nxv1i64_allones_mask_vscale(<vscale x 1 x i64> %val, ptr %ptr) {
   ret void
 }
 
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_nxv1i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = and <vscale x 1 x i1> [[TMP2]], insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i64> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP4]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT:    ret { <vscale x 1 x i64>, i32 } [[TMP3]]
+;
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64_vscale(ptr %ptr, <vscale x 1 x i1> %m) {
+; CHECK-LABEL: @vpload_ff_nxv1i64_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 1 x i1> [[M:%.*]], insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP4]], <vscale x 1 x i64> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP2]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP3]], i32 1, 1
+; CHECK-NEXT:    ret { <vscale x 1 x i64>, i32 } [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_nxv1i64_allones_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = and <vscale x 1 x i1> [[TMP2]], insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i64> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP4]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT:    ret { <vscale x 1 x i64>, i32 } [[TMP3]]
+;
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64_allones_mask_vscale(ptr %ptr) {
+; CHECK-LABEL: @vpload_ff_nxv1i64_allones_mask_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0), <vscale x 1 x i64> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP3]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP2]], i32 1, 1
+; CHECK-NEXT:    ret { <vscale x 1 x i64>, i32 } [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %vlmax)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
 declare i32 @llvm.vscale.i32()
 
 declare <2 x i64> @llvm.vp.load.v2i64.p0(ptr, <2 x i1>, i32)