From 11ac166c1a8b1ebccb528094fe768aaa93668f1d Mon Sep 17 00:00:00 2001
From: Matthew Devereau
Date: Fri, 5 Sep 2025 11:54:10 +0000
Subject: [PATCH 1/6] [GVN] Teach GVN simple masked load/store forwarding

This patch teaches GVN how to eliminate redundant masked loads and how
to forward previously loaded or stored values with a select. This is
possible when the same mask is used for masked stores and masked loads
that access the same memory location.
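
For example, given the same mask %m and passthrough %p (an illustrative
sketch; %a, %b, %m, %v, %w and %p are placeholder names, not taken from
the tests):

  %v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %a, i32 1, <4 x i1> %m, <4 x float> %p)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %b, i32 1, <4 x i1> %m)
  %w = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %b, i32 1, <4 x i1> %m, <4 x float> %p)

the second load is redundant: active lanes reload the value just
stored, and inactive lanes produce the passthrough %p in both loads, so
%w can be replaced with %v.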
---
 llvm/include/llvm/Transforms/Scalar/GVN.h     |   2 +
 llvm/lib/Transforms/Scalar/GVN.cpp            |  50 ++++++
 llvm/test/Transforms/GVN/masked-load-store.ll | 158 ++++++++++++++++++
 3 files changed, 210 insertions(+)

diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index 245414935bc0f..74a4d6ce00fcc 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -56,6 +56,7 @@ class OptimizationRemarkEmitter;
 class PHINode;
 class TargetLibraryInfo;
 class Value;
+class IntrinsicInst;
 /// A private "module" namespace for types and utilities used by GVN. These
 /// are implementation details and should not be used by clients.
 namespace LLVM_LIBRARY_VISIBILITY_NAMESPACE gvn {
@@ -349,6 +350,7 @@ class GVNPass : public PassInfoMixin<GVNPass> {

   // Helper functions of redundant load elimination.
   bool processLoad(LoadInst *L);
+  bool processMaskedLoad(IntrinsicInst *I);
   bool processNonLocalLoad(LoadInst *L);
   bool processAssumeIntrinsic(AssumeInst *II);

diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 26e17cc849bff..10325ab7c5737 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -50,6 +50,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -2287,6 +2288,50 @@ bool GVNPass::processLoad(LoadInst *L) {
   return true;
 }

+// Attempt to process masked loads which have loaded from
+// masked stores with the same mask
+bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
+  Value *Mask = I->getOperand(2);
+  Value *Passthrough = I->getOperand(3);
+
+  MemDepResult Dep = MD->getDependency(I);
+  Instruction *DepInst = Dep.getInst();
+  if (!DepInst || !Dep.isLocal())
+    return false;
+
+  auto *MaskedStore = dyn_cast<IntrinsicInst>(DepInst);
+  if (!MaskedStore || MaskedStore->getIntrinsicID() != Intrinsic::masked_store)
+    return false;
+
+  auto StoreMask = MaskedStore->getOperand(3);
+  if (StoreMask != Mask)
+    return false;
+
+  Value *OpToForward =
+      AvailableValue::get(MaskedStore->getOperand(0)).getSimpleValue();
+  if (auto *LoadToForward = dyn_cast<IntrinsicInst>(OpToForward);
+      LoadToForward &&
+      LoadToForward->getIntrinsicID() == Intrinsic::masked_load) {
+    // For MaskedLoad->MaskedStore->MaskedLoad, the mask must be the same for
+    // all three instructions. The Passthrough on the two loads must also be the
+    // same.
+    if (LoadToForward->getOperand(2) != Mask ||
+        LoadToForward->getOperand(3) != Passthrough)
+      return false;
+  } else {
+    // MaskedStore(Op, ptr, mask)->MaskedLoad(ptr, mask, passthrough) can be
+    // replaced with MaskedStore(Op, ptr, mask)->select(mask, Op, passthrough)
+    IRBuilder<> Builder(I);
+    OpToForward = Builder.CreateSelect(StoreMask, OpToForward, Passthrough);
+  }
+
+  I->replaceAllUsesWith(OpToForward);
+  ICF->removeUsersOf(I);
+  salvageAndRemoveInstruction(I);
+  ++NumGVNLoad;
+  return true;
+}
+
 /// Return a pair the first field showing the value number of \p Exp and the
 /// second field showing whether it is a value number newly created.
 std::pair<uint32_t, bool>
@@ -2734,6 +2779,11 @@ bool GVNPass::processInstruction(Instruction *I) {
     return false;
   }

+  if (auto *II = dyn_cast<IntrinsicInst>(I))
+    if (II && II->getIntrinsicID() == Intrinsic::masked_load)
+      if (processMaskedLoad(II))
+        return true;
+
   // For conditional branches, we can perform simple conditional propagation on
   // the condition value itself.
   if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index 984a756591701..b32279941d0b0 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -36,6 +36,164 @@ define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) {
   ret <128 x i8> %v4
 }

+define <4 x float> @forward_masked_load(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_masked_load(
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %6 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+  %7 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %6, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %7, ptr %1, i32 1, <4 x i1> %6)
+  %8 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %6, <4 x float> zeroinitializer)
+  ret <4 x float> %8
+}
+
+define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_binop_splat_i1_mask(
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <4 x float> [[FMUL]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load.1.0
+}
+
+define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
+  ret <4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_masked_load_scalable(
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP4]]
+;
+  %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
+  %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %8
+}
+
+define <vscale x 4 x float> @bail_on_different_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @bail_on_different_passthrough(
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP5]]
+;
+  %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
+  %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %8
+}
+
+define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel_scalable(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[FMUL]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP3]]
+;
+  %mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @load_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @load_mask_differs(
+; CHECK-NEXT:    [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT:    [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]])
+; CHECK-NEXT:    [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+  %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+  %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask0)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask1, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @store_mask_differs(
+; CHECK-NEXT:    [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT:    [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK1]])
+; CHECK-NEXT:    [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+  %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+  %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask1)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
+declare <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr captures(none), i32 immarg, <vscale x 4 x i1>, <vscale x 4 x float>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
+declare void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float>, ptr captures(none), i32 immarg, <vscale x 4 x i1>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32) #3
+
 declare <128 x i8> @llvm.masked.load.v128i8.p0(ptr, i32, <128 x i1>, <128 x i8>)
 declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>)

From c335ed60b89b4f63e964637220daff13839e8277 Mon Sep 17 00:00:00 2001
From: Matthew Devereau
Date: Wed, 10 Sep 2025 09:24:12 +0000
Subject: [PATCH 2/6] Use PatternMatch
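
Using the PatternMatch matchers collapses the hand-rolled intrinsic and
operand checks into single match() calls; roughly (a sketch of the new
dependency check, with the same variable names as the diff below):

  Value *StoreVal;
  if (!match(DepInst,
             m_Intrinsic<Intrinsic::masked_store>(m_Value(StoreVal), m_Value(),
                                                  m_Value(), m_Specific(Mask))))
    return false;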
---
 llvm/lib/Transforms/Scalar/GVN.cpp            | 28 ++++++----------
 llvm/test/Transforms/GVN/masked-load-store.ll | 30 +++++++++----------
 2 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 10325ab7c5737..7b43d922b7caa 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2299,30 +2299,26 @@ bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
   if (!DepInst || !Dep.isLocal())
     return false;

-  auto *MaskedStore = dyn_cast<IntrinsicInst>(DepInst);
-  if (!MaskedStore || MaskedStore->getIntrinsicID() != Intrinsic::masked_store)
+  Value *StoreVal;
+  if (!match(DepInst,
+             m_Intrinsic<Intrinsic::masked_store>(m_Value(StoreVal), m_Value(),
+                                                  m_Value(), m_Specific(Mask))))
     return false;

-  auto StoreMask = MaskedStore->getOperand(3);
-  if (StoreMask != Mask)
-    return false;
-
-  Value *OpToForward =
-      AvailableValue::get(MaskedStore->getOperand(0)).getSimpleValue();
-  if (auto *LoadToForward = dyn_cast<IntrinsicInst>(OpToForward);
-      LoadToForward &&
-      LoadToForward->getIntrinsicID() == Intrinsic::masked_load) {
+  Value *OpToForward = nullptr;
+  if (match(StoreVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(Mask),
+                                   m_Specific(Passthrough))))
     // For MaskedLoad->MaskedStore->MaskedLoad, the mask must be the same for
     // all three instructions. The Passthrough on the two loads must also be the
     // same.
-    if (LoadToForward->getOperand(2) != Mask ||
-        LoadToForward->getOperand(3) != Passthrough)
-      return false;
-  } else {
+    OpToForward = AvailableValue::get(StoreVal).getSimpleValue();
+  else if (match(StoreVal, m_Intrinsic<Intrinsic::masked_load>()))
+    return false;
+  else {
     // MaskedStore(Op, ptr, mask)->MaskedLoad(ptr, mask, passthrough) can be
     // replaced with MaskedStore(Op, ptr, mask)->select(mask, Op, passthrough)
     IRBuilder<> Builder(I);
-    OpToForward = Builder.CreateSelect(StoreMask, OpToForward, Passthrough);
+    OpToForward = Builder.CreateSelect(Mask, StoreVal, Passthrough);
   }

   I->replaceAllUsesWith(OpToForward);
diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index b32279941d0b0..77888476edb7d 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -42,11 +42,11 @@ define <4 x float> @forward_masked_load(ptr %0, ptr %1) {
 ; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ;
-  %6 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
-  %7 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %6, <4 x float> zeroinitializer)
-  call void @llvm.masked.store.v4f32.p0(<4 x float> %7, ptr %1, i32 1, <4 x i1> %6)
-  %8 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %6, <4 x float> zeroinitializer)
-  ret <4 x float> %8
+  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+  %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %load1, ptr %1, i32 1, <4 x i1> %mask)
+  %load2 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load2
 }

 define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) {
@@ -96,11 +96,11 @@ define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vsca
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP4]]
 ;
-  %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
-  %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
-  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
-  %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
-  ret <vscale x 4 x float> %8
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load2
 }

 define <vscale x 4 x float> @bail_on_different_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
@@ -111,11 +111,11 @@ define <vscale x 4 x float> @bail_on_different_passthrough(ptr %0, ptr %1, <vsc
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP5]]
 ;
-  %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
-  %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> zeroinitializer)
-  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
-  %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
-  ret <vscale x 4 x float> %8
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load2
 }

 define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {

From 0d7107f2838dc06b6220f96615de1a04cff7d64c Mon Sep 17 00:00:00 2001
From: Matthew Devereau
Date: Wed, 10 Sep 2025 09:51:38 +0000
Subject: [PATCH 3/6] Remove intrinsic declarations

---
 llvm/test/Transforms/GVN/masked-load-store.ll | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index 77888476edb7d..e9d3223055995 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -184,16 +184,3 @@ define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x f
   %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> %passthrough)
   ret <vscale x 4 x float> %load.1.0
 }
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
-declare <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr captures(none), i32 immarg, <vscale x 4 x i1>, <vscale x 4 x float>) #1
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
-declare void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float>, ptr captures(none), i32 immarg, <vscale x 4 x i1>) #2
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32) #3
-
-declare <128 x i8> @llvm.masked.load.v128i8.p0(ptr, i32, <128 x i1>, <128 x i8>)
-declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>)
-

From 8f7fe4ae8a4048506aa6de46a4b37baa8f6d01a3 Mon Sep 17 00:00:00 2001
From: Matthew Devereau
Date: Tue, 23 Sep 2025 10:41:27 +0000
Subject: [PATCH 4/6] Respond to david-arm's review comments
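
The main functional change: instead of forwarding the stored value
directly and bailing when the passthroughs differ, the load is now
always replaced by a select between the stored value and the load's
passthrough. An illustrative sketch (placeholder names, mirroring the
generate_sel_with_passthrough test below):

  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %v, ptr %p, i32 1, <vscale x 4 x i1> %m)
  %w = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %p, i32 1, <vscale x 4 x i1> %m, <vscale x 4 x float> %pt)

becomes

  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %v, ptr %p, i32 1, <vscale x 4 x i1> %m)
  %w = select <vscale x 4 x i1> %m, <vscale x 4 x float> %v, <vscale x 4 x float> %pt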
---
 llvm/include/llvm/IR/PatternMatch.h           |  8 ++++
 llvm/lib/Transforms/Scalar/GVN.cpp            | 44 +++++++------------
 .../GVN/masked-load-store-no-mem-dep.ll       | 34 ++++++++++++++
 llvm/test/Transforms/GVN/masked-load-store.ll |  9 ++--
 4 files changed, 63 insertions(+), 32 deletions(-)
 create mode 100644 llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll

diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 2cb78904dd799..8704f39436364 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2781,6 +2781,14 @@ m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
   return m_Intrinsic<Intrinsic::masked_load>(Op0, Op1, Op2, Op3);
 }

+/// Matches MaskedStore Intrinsic.
+template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
+m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
+              const Opnd3 &Op3) {
+  return m_Intrinsic<Intrinsic::masked_store>(Op0, Op1, Op2, Op3);
+}
+
 /// Matches MaskedGather Intrinsic.
 template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
 inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 7b43d922b7caa..5aabe89df04b8 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -50,7 +50,6 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -2291,39 +2290,29 @@ bool GVNPass::processLoad(LoadInst *L) {
 // Attempt to process masked loads which have loaded from
 // masked stores with the same mask
 bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
-  Value *Mask = I->getOperand(2);
-  Value *Passthrough = I->getOperand(3);
-
+  if (!MD)
+    return false;
   MemDepResult Dep = MD->getDependency(I);
   Instruction *DepInst = Dep.getInst();
-  if (!DepInst || !Dep.isLocal())
+  if (!DepInst || !Dep.isLocal() || !Dep.isDef())
     return false;

+  Value *Mask = I->getOperand(2);
+  Value *Passthrough = I->getOperand(3);
   Value *StoreVal;
-  if (!match(DepInst,
-             m_Intrinsic<Intrinsic::masked_store>(m_Value(StoreVal), m_Value(),
-                                                  m_Value(), m_Specific(Mask))))
+  if (!match(DepInst, m_MaskedStore(m_Value(StoreVal), m_Value(), m_Value(),
+                                    m_Specific(Mask))))
     return false;

-  Value *OpToForward = nullptr;
-  if (match(StoreVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(Mask),
-                                   m_Specific(Passthrough))))
-    // For MaskedLoad->MaskedStore->MaskedLoad, the mask must be the same for
-    // all three instructions. The Passthrough on the two loads must also be the
-    // same.
-    OpToForward = AvailableValue::get(StoreVal).getSimpleValue();
-  else if (match(StoreVal, m_Intrinsic<Intrinsic::masked_load>()))
-    return false;
-  else {
-    // MaskedStore(Op, ptr, mask)->MaskedLoad(ptr, mask, passthrough) can be
-    // replaced with MaskedStore(Op, ptr, mask)->select(mask, Op, passthrough)
-    IRBuilder<> Builder(I);
-    OpToForward = Builder.CreateSelect(Mask, StoreVal, Passthrough);
-  }
+  // Remove the load but generate a select for the passthrough
+  Value *OpToForward = llvm::SelectInst::Create(Mask, StoreVal, Passthrough, "",
+                                                I->getIterator());

-  I->replaceAllUsesWith(OpToForward);
   ICF->removeUsersOf(I);
+  I->replaceAllUsesWith(OpToForward);
   salvageAndRemoveInstruction(I);
+  if (OpToForward->getType()->isPtrOrPtrVectorTy())
+    MD->invalidateCachedPointerInfo(OpToForward);
   ++NumGVNLoad;
   return true;
 }
@@ -2775,10 +2764,9 @@ bool GVNPass::processInstruction(Instruction *I) {
     return false;
   }

-  if (auto *II = dyn_cast<IntrinsicInst>(I))
-    if (II && II->getIntrinsicID() == Intrinsic::masked_load)
-      if (processMaskedLoad(II))
-        return true;
+  if (match(I, m_Intrinsic<Intrinsic::masked_load>()) &&
+      processMaskedLoad(cast<IntrinsicInst>(I)))
+    return true;

   // For conditional branches, we can perform simple conditional propagation on
   // the condition value itself.
diff --git a/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll b/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll
new file mode 100644
index 0000000000000..512ea37641ab9
--- /dev/null
+++ b/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=gvn -S -enable-gvn-memdep=true < %s | FileCheck %s
+; RUN: opt -passes=gvn -S -enable-gvn-memdep=false < %s | FileCheck %s --check-prefix=MEMDEPFALSE
+
+define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+; MEMDEPFALSE-LABEL: @forward_binop_with_sel(
+; MEMDEPFALSE-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; MEMDEPFALSE-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; MEMDEPFALSE-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; MEMDEPFALSE-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; MEMDEPFALSE-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; MEMDEPFALSE-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; MEMDEPFALSE-NEXT:    [[LOAD_1_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 1, <4 x i1> [[MASK]], <4 x float> [[PASSTHROUGH:%.*]])
+; MEMDEPFALSE-NEXT:    ret <4 x float> [[LOAD_1_0]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
+  ret <4 x float> %load.1.0
+}
diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index e9d3223055995..71b8f503aeadb 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -94,7 +94,8 @@ define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vsca
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP5]]
 ;
   %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
   %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
@@ -103,12 +104,12 @@ define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vsca
   ret <vscale x 4 x float> %load2
 }

-define <vscale x 4 x float> @bail_on_different_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
-; CHECK-LABEL: @bail_on_different_passthrough(
+define <vscale x 4 x float> @generate_sel_with_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @generate_sel_with_passthrough(
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer)
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP5]]
 ;
   %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)

From a3a6f9f64b5b0895cd9f4e7814596f9e04dbb293 Mon Sep 17 00:00:00 2001
From: Matthew Devereau
Date: Mon, 29 Sep 2025 16:47:15 +0000
Subject: [PATCH 5/6] Remove pointer check and extra test

---
 llvm/lib/Transforms/Scalar/GVN.cpp            |  2 --
 llvm/test/Transforms/GVN/masked-load-store.ll | 13 +++++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 5aabe89df04b8..0583754d9bf75 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2311,8 +2311,6 @@ bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
   ICF->removeUsersOf(I);
   I->replaceAllUsesWith(OpToForward);
   salvageAndRemoveInstruction(I);
-  if (OpToForward->getType()->isPtrOrPtrVectorTy())
-    MD->invalidateCachedPointerInfo(OpToForward);
   ++NumGVNLoad;
   return true;
 }
diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index 71b8f503aeadb..9b72392c4c61b 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -49,6 +49,19 @@ define <4 x float> @forward_masked_load(ptr %0, ptr %1) {
   ret <4 x float> %load2
 }

+define <4 x float> @forward_masked_load_arbitrary_mask(ptr %loc_a, ptr %loc_b, <4 x i1> %mask) {
+; CHECK-LABEL: @forward_masked_load_arbitrary_mask(
+; CHECK-NEXT:    [[LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[LOC_A:%.*]], i32 1, <4 x i1> [[MASK:%.*]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD1]], ptr [[LOC_B:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[LOAD1]], <4 x float> zeroinitializer
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %loc_a, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %load1, ptr %loc_b, i32 1, <4 x i1> %mask)
+  %load2 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %loc_b, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load2
+}
+
 define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) {

From 66937048c015055e36fb0bef8524c86ec73db0fc Mon Sep 17 00:00:00 2001
From: Matthew Devereau
Date: Tue, 30 Sep 2025 10:24:36 +0000
Subject: [PATCH 6/6] Add bail and test for type mismatch

---
 llvm/lib/Transforms/Scalar/GVN.cpp            |  3 ++-
 llvm/test/Transforms/GVN/masked-load-store.ll | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 0583754d9bf75..b9b5b5823d780 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2301,7 +2301,8 @@ bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
   Value *Passthrough = I->getOperand(3);
   Value *StoreVal;
   if (!match(DepInst, m_MaskedStore(m_Value(StoreVal), m_Value(), m_Value(),
-                                    m_Specific(Mask))))
+                                    m_Specific(Mask))) ||
+      StoreVal->getType() != I->getType())
     return false;

   // Remove the load but generate a select for the passthrough
diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll
index 9b72392c4c61b..b112e990e0c58 100644
--- a/llvm/test/Transforms/GVN/masked-load-store.ll
+++ b/llvm/test/Transforms/GVN/masked-load-store.ll
@@ -117,6 +117,21 @@ define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vsca
   ret <vscale x 4 x float> %load2
 }

+define <vscale x 4 x float> @forward_masked_load_scalable_type_mismatch(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_masked_load_scalable_type_mismatch(
+; CHECK-NEXT:    [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD1:%.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x double> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[LOAD1]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT:    [[LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD2]]
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load1 = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x double> zeroinitializer)
+  call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load2
+}
+
 define <vscale x 4 x float> @generate_sel_with_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
 ; CHECK-LABEL: @generate_sel_with_passthrough(
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)