diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 2cb78904dd799..8704f39436364 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2781,6 +2781,14 @@ m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
   return m_Intrinsic<Intrinsic::masked_load>(Op0, Op1, Op2, Op3);
 }
 
+/// Matches MaskedStore Intrinsic.
+template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
+m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
+              const Opnd3 &Op3) {
+  return m_Intrinsic<Intrinsic::masked_store>(Op0, Op1, Op2, Op3);
+}
+
 /// Matches MaskedGather Intrinsic.
 template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
 inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index 245414935bc0f..74a4d6ce00fcc 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -56,6 +56,7 @@ class OptimizationRemarkEmitter;
 class PHINode;
 class TargetLibraryInfo;
 class Value;
+class IntrinsicInst;
 /// A private "module" namespace for types and utilities used by GVN. These
 /// are implementation details and should not be used by clients.
 namespace LLVM_LIBRARY_VISIBILITY_NAMESPACE gvn {
@@ -349,6 +350,7 @@ class GVNPass : public PassInfoMixin<GVNPass> {
 
   // Helper functions of redundant load elimination.
   bool processLoad(LoadInst *L);
+  bool processMaskedLoad(IntrinsicInst *I);
   bool processNonLocalLoad(LoadInst *L);
   bool processAssumeIntrinsic(AssumeInst *II);
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 26e17cc849bff..b9b5b5823d780 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2287,6 +2287,35 @@ bool GVNPass::processLoad(LoadInst *L) {
   return true;
 }
 
+// Attempt to process masked loads that read from masked stores with the
+// same mask.
+bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
+  if (!MD)
+    return false;
+  MemDepResult Dep = MD->getDependency(I);
+  Instruction *DepInst = Dep.getInst();
+  if (!DepInst || !Dep.isLocal() || !Dep.isDef())
+    return false;
+
+  Value *Mask = I->getOperand(2);
+  Value *Passthrough = I->getOperand(3);
+  Value *StoreVal;
+  if (!match(DepInst, m_MaskedStore(m_Value(StoreVal), m_Value(), m_Value(),
+                                    m_Specific(Mask))) ||
+      StoreVal->getType() != I->getType())
+    return false;
+
+  // Remove the load but generate a select for the passthrough.
+  Value *OpToForward = llvm::SelectInst::Create(Mask, StoreVal, Passthrough, "",
+                                                I->getIterator());
+
+  ICF->removeUsersOf(I);
+  I->replaceAllUsesWith(OpToForward);
+  salvageAndRemoveInstruction(I);
+  ++NumGVNLoad;
+  return true;
+}
+
 /// Return a pair the first field showing the value number of \p Exp and the
 /// second field showing whether it is a value number newly created.
 std::pair<uint32_t, bool>
@@ -2734,6 +2763,10 @@ bool GVNPass::processInstruction(Instruction *I) {
     return false;
   }
 
+  if (match(I, m_Intrinsic<Intrinsic::masked_load>()) &&
+      processMaskedLoad(cast<IntrinsicInst>(I)))
+    return true;
+
   // For conditional branches, we can perform simple conditional propagation on
   // the condition value itself.
   if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
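For illustration, here is a minimal before/after sketch of the rewrite performed by processMaskedLoad (hypothetical IR, not taken from the patch's tests; %p, %val, and %passthrough are placeholder names). When MemoryDependenceAnalysis reports that a masked load is locally defined by a masked store using the identical mask value, the load is replaced by a select that forwards the stored value on active lanes and the load's passthrough on inactive lanes:

  ; Before:
  call void @llvm.masked.store.v4f32.p0(<4 x float> %val, ptr %p, i32 1, <4 x i1> %mask)
  %ld = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1, <4 x i1> %mask, <4 x float> %passthrough)

  ; After (the store is left in place; only the load is rewritten):
  call void @llvm.masked.store.v4f32.p0(<4 x float> %val, ptr %p, i32 1, <4 x i1> %mask)
  %ld = select <4 x i1> %mask, <4 x float> %val, <4 x float> %passthrough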
diff --git a/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll b/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll
new file mode 100644
index 0000000000000..512ea37641ab9
--- /dev/null
+++ b/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=gvn -S -enable-gvn-memdep=true < %s | FileCheck %s
+; RUN: opt -passes=gvn -S -enable-gvn-memdep=false < %s | FileCheck %s --check-prefix=MEMDEPFALSE
+
+define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+; MEMDEPFALSE-LABEL: @forward_binop_with_sel(
+; MEMDEPFALSE-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; MEMDEPFALSE-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; MEMDEPFALSE-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; MEMDEPFALSE-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; MEMDEPFALSE-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; MEMDEPFALSE-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; MEMDEPFALSE-NEXT:    [[LOAD_1_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 1, <4 x i1> [[MASK]], <4 x float> [[PASSTHROUGH:%.*]])
+; MEMDEPFALSE-NEXT:    ret <4 x float> [[LOAD_1_0]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
+  ret <4 x float> %load.1.0
+}
diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index 984a756591701..b112e990e0c58 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -36,6 +36,180 @@ define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) {
   ret <128 x i8> %v4
 }
 
-declare <128 x i8> @llvm.masked.load.v128i8.p0(ptr, i32, <128 x i1>, <128 x i8>)
-declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>)
+define <4 x float> @forward_masked_load(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_masked_load(
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+  %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %load1, ptr %1, i32 1, <4 x i1> %mask)
+  %load2 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load2
+}
+
+define <4 x float> @forward_masked_load_arbitrary_mask(ptr %loc_a, ptr %loc_b, <4 x i1> %mask) {
+; CHECK-LABEL: @forward_masked_load_arbitrary_mask(
+; CHECK-NEXT:    [[LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[LOC_A:%.*]], i32 1, <4 x i1> [[MASK:%.*]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD1]], ptr [[LOC_B:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[LOAD1]], <4 x float> zeroinitializer
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %loc_a, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %load1, ptr %loc_b, i32 1, <4 x i1> %mask)
+  %load2 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %loc_b, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load2
+}
+
+define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_binop_splat_i1_mask(
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <4 x float> [[FMUL]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load.1.0
+}
+
+define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
+  ret <4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_masked_load_scalable(
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP5]]
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load2
+}
+
+define <vscale x 4 x float> @forward_masked_load_scalable_type_mismatch(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_masked_load_scalable_type_mismatch(
+; CHECK-NEXT:    [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD1:%.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x double> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[LOAD1]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT:    [[LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD2]]
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load1 = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x double> zeroinitializer)
+  call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load2
+}
+
+define <vscale x 4 x float> @generate_sel_with_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @generate_sel_with_passthrough(
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP5]]
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load2
+}
+
+define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel_scalable(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[FMUL]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP3]]
+;
+  %mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @load_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @load_mask_differs(
+; CHECK-NEXT:    [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT:    [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]])
+; CHECK-NEXT:    [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+  %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+  %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask0)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask1, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @store_mask_differs(
+; CHECK-NEXT:    [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT:    [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK1]])
+; CHECK-NEXT:    [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+  %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+  %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask1)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
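Note on the two negative tests above: the mask comparison in processMaskedLoad uses m_Specific, so forwarding requires the load and the store to use the same mask SSA value; masks that are merely equal in value do not match directly. A sketch of the practical consequence (hypothetical IR; %p, %val, and %pt are placeholder names): since GVN visits instructions in order and value-numbers pure calls, a duplicated mask computation is expected to be unified first, after which the forwarding above applies.

  %mask0 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
  %mask1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4) ; redundant; GVN replaces its uses with %mask0
  call void @llvm.masked.store.v4f32.p0(<4 x float> %val, ptr %p, i32 1, <4 x i1> %mask0)
  %ld = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1, <4 x i1> %mask1, <4 x float> %pt)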