From 513c98602fe4cbed34eddb5d2cbfbf242ad52e24 Mon Sep 17 00:00:00 2001
From: Michael Bedy
Date: Thu, 13 Nov 2025 13:49:14 -0500
Subject: [PATCH 1/3] [SLP] Invariant loads cannot have a memory dependency on
 stores.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 19 ++++-
 .../AMDGPU/invariant-load-no-alias-store.ll   | 82 +++++++++++++++++++
 2 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cc53b0dd3577e..e67e607ee1632 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21623,6 +21623,17 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
       }
     }
 
+  // Helper to detect loads marked with !invariant.load metadata. Such loads
+  // are defined to read from memory that never changes for the lifetime of
+  // the program; any store to the same location would be UB. Therefore we
+  // can conservatively treat an invariant load and any store as non-aliasing
+  // for scheduling/dep purposes and skip creating a dependency edge.
+  auto IsInvariantLoad = [](const Instruction *I) {
+    if (const auto *LI = dyn_cast<LoadInst>(I))
+      return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
+    return false;
+  };
+
   // Handle the memory dependencies (if any).
   ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
   if (!NextLoadStore)
@@ -21636,10 +21647,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
   unsigned DistToSrc = 1;
   bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
 
+  if (IsInvariantLoad(SrcInst))
+    return; // Invariant load cannot have memory dependencies.
+ for (ScheduleData *DepDest = NextLoadStore; DepDest; DepDest = DepDest->getNextLoadStore()) { assert(isInSchedulingRegion(*DepDest) && "Expected to be in region"); + Instruction *DestInst = DepDest->getInst(); + // We have two limits to reduce the complexity: // 1) AliasedCheckLimit: It's a small limit to reduce calls to // SLP->isAliased (which is the expensive part in this loop). @@ -21648,7 +21664,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies( // It's important for the loop break condition (see below) to // check this limit even between two read-only instructions. if (DistToSrc >= MaxMemDepDistance || - ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) && + (!IsInvariantLoad(DestInst) && // Cannot have memory deps. + (SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) && (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit || SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) { diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll new file mode 100644 index 0000000000000..00bd0feb8c0fa --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes="function(slp-vectorizer)" -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 %s -S | FileCheck %s + +define void @test(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0 +; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0 +; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) 
[[OTHERB]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[A0PTR]], align 2, !invariant.load [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[B0PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc <2 x half> [[TMP0]], [[TMP1]] +; CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[P0]], align 2 +; CHECK-NEXT: ret void +; +entry: + %p0 = getelementptr half, ptr addrspace(1) %base, i32 0 + %p1 = getelementptr half, ptr addrspace(1) %base, i32 1 + ; First pair of invariant loads from otherA. + %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0 + %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0 + %A0 = load half, ptr addrspace(1) %A0PTR, align 2, !invariant.load !0 + %B0 = load half, ptr addrspace(1) %B0PTR, align 2, !invariant.load !0 + %add0 = fadd reassoc half %A0, %B0 + store half %add0, ptr addrspace(1) %p0, align 2 + %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1 + %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1 + %A1 = load half, ptr addrspace(1) %A1PTR, align 2, !invariant.load !0 + %B1 = load half, ptr addrspace(1) %B1PTR, align 2, !invariant.load !0 + %add1 = fadd reassoc half %A1, %B1 + store half %add1, ptr addrspace(1) %p1, align 2 + ret void +} + + +define void @aliastest(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 { +; CHECK-LABEL: define void @aliastest( +; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0 +; CHECK-NEXT: [[P1:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 1 +; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0 +; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0 +; CHECK-NEXT: [[A0:%.*]] = load half, ptr 
addrspace(1) [[A0PTR]], align 2 +; CHECK-NEXT: [[B0:%.*]] = load half, ptr addrspace(1) [[B0PTR]], align 2 +; CHECK-NEXT: [[ADD0:%.*]] = fadd reassoc half [[A0]], [[B0]] +; CHECK-NEXT: store half [[ADD0]], ptr addrspace(1) [[P0]], align 2 +; CHECK-NEXT: [[A1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 1 +; CHECK-NEXT: [[B1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 1 +; CHECK-NEXT: [[A1:%.*]] = load half, ptr addrspace(1) [[A1PTR]], align 2 +; CHECK-NEXT: [[B1:%.*]] = load half, ptr addrspace(1) [[B1PTR]], align 2 +; CHECK-NEXT: [[ADD1:%.*]] = fadd reassoc half [[A1]], [[B1]] +; CHECK-NEXT: store half [[ADD1]], ptr addrspace(1) [[P1]], align 2 +; CHECK-NEXT: ret void +; +entry: + %p0 = getelementptr half, ptr addrspace(1) %base, i32 0 + %p1 = getelementptr half, ptr addrspace(1) %base, i32 1 + ; First pair of invariant loads from otherA. + %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0 + %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0 + %A0 = load half, ptr addrspace(1) %A0PTR, align 2 + %B0 = load half, ptr addrspace(1) %B0PTR, align 2 + %add0 = fadd reassoc half %A0, %B0 + store half %add0, ptr addrspace(1) %p0, align 2 + %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1 + %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1 + %A1 = load half, ptr addrspace(1) %A1PTR, align 2 + %B1 = load half, ptr addrspace(1) %B1PTR, align 2 + %add1 = fadd reassoc half %A1, %B1 + store half %add1, ptr addrspace(1) %p1, align 2 + ret void +} + + +attributes #0 = { nounwind } + +!0 = !{} +;. +; CHECK: [[META0]] = !{} +;. From 4d014b8e1e3f004248eaefbd50353a4147bcd4cb Mon Sep 17 00:00:00 2001 From: Michael Bedy Date: Thu, 13 Nov 2025 22:10:06 -0500 Subject: [PATCH 2/3] Update location; exclude invariant (and simple) loads from memory inst scheduler data list. Add volatile test. 
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 30 ++++++--------
 .../AMDGPU/invariant-load-no-alias-store.ll   | 39 +++++++++++++++++++
 2 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e67e607ee1632..8cf5f9e133b6e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21412,7 +21412,18 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
            "new ScheduleData already in scheduling region");
     SD->init(SchedulingRegionID, I);
 
+    auto CanIgnoreLoad = [](const Instruction *I) {
+      const LoadInst *LI = dyn_cast<LoadInst>(I);
+      // If there is a simple load marked as invariant, we can ignore it.
+      // But, in the (unlikely) case of non-simple invariant load,
+      // we should not ignore it.
+      return LI && LI->isSimple() &&
+             LI->getMetadata(LLVMContext::MD_invariant_load);
+    };
+
     if (I->mayReadOrWriteMemory() &&
+        // Simple InvariantLoad does not depend on other memory accesses.
+        !CanIgnoreLoad(I) &&
         (!isa<IntrinsicInst>(I) ||
          (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
           cast<IntrinsicInst>(I)->getIntrinsicID() !=
@@ -21623,17 +21634,6 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
       }
     }
 
-  // Helper to detect loads marked with !invariant.load metadata. Such loads
-  // are defined to read from memory that never changes for the lifetime of
-  // the program; any store to the same location would be UB. Therefore we
-  // can conservatively treat an invariant load and any store as non-aliasing
-  // for scheduling/dep purposes and skip creating a dependency edge.
-  auto IsInvariantLoad = [](const Instruction *I) {
-    if (const auto *LI = dyn_cast<LoadInst>(I))
-      return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
-    return false;
-  };
-
   // Handle the memory dependencies (if any).
ScheduleData *NextLoadStore = BundleMember->getNextLoadStore(); if (!NextLoadStore) @@ -21647,15 +21647,10 @@ void BoUpSLP::BlockScheduling::calculateDependencies( unsigned DistToSrc = 1; bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst); - if (IsInvariantLoad(SrcInst)) - return; // Invariant load cannot have memory dependencies. - for (ScheduleData *DepDest = NextLoadStore; DepDest; DepDest = DepDest->getNextLoadStore()) { assert(isInSchedulingRegion(*DepDest) && "Expected to be in region"); - Instruction *DestInst = DepDest->getInst(); - // We have two limits to reduce the complexity: // 1) AliasedCheckLimit: It's a small limit to reduce calls to // SLP->isAliased (which is the expensive part in this loop). @@ -21664,8 +21659,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies( // It's important for the loop break condition (see below) to // check this limit even between two read-only instructions. if (DistToSrc >= MaxMemDepDistance || - (!IsInvariantLoad(DestInst) && // Cannot have memory deps. 
- (SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) && + ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) && (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit || SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) { diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll index 00bd0feb8c0fa..87537c05573ae 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll @@ -73,6 +73,45 @@ entry: ret void } +define void @voltest(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 { +; CHECK-LABEL: define void @voltest( +; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0 +; CHECK-NEXT: [[P1:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 1 +; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0 +; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0 +; CHECK-NEXT: [[A0:%.*]] = load volatile half, ptr addrspace(1) [[A0PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[B0:%.*]] = load volatile half, ptr addrspace(1) [[B0PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[ADD0:%.*]] = fadd reassoc half [[A0]], [[B0]] +; CHECK-NEXT: store half [[ADD0]], ptr addrspace(1) [[P0]], align 2 +; CHECK-NEXT: [[A1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 1 +; CHECK-NEXT: [[B1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 1 +; CHECK-NEXT: [[A1:%.*]] = load volatile half, ptr addrspace(1) [[A1PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[B1:%.*]] = load volatile half, ptr addrspace(1) [[B1PTR]], align 2, 
!invariant.load [[META0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd reassoc half [[A1]], [[B1]]
+; CHECK-NEXT:    store half [[ADD1]], ptr addrspace(1) [[P1]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p0 = getelementptr half, ptr addrspace(1) %base, i32 0
+  %p1 = getelementptr half, ptr addrspace(1) %base, i32 1
+  ; First pair of invariant loads from otherA.
+  %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0
+  %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0
+  %A0 = load volatile half, ptr addrspace(1) %A0PTR, align 2, !invariant.load !0
+  %B0 = load volatile half, ptr addrspace(1) %B0PTR, align 2, !invariant.load !0
+  %add0 = fadd reassoc half %A0, %B0
+  store half %add0, ptr addrspace(1) %p0, align 2
+  %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1
+  %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1
+  %A1 = load volatile half, ptr addrspace(1) %A1PTR, align 2, !invariant.load !0
+  %B1 = load volatile half, ptr addrspace(1) %B1PTR, align 2, !invariant.load !0
+  %add1 = fadd reassoc half %A1, %B1
+  store half %add1, ptr addrspace(1) %p1, align 2
+  ret void
+}
+
 attributes #0 = { nounwind }

From d6fdd3d2f09812d3d2146b400da7bf6eb5804d96 Mon Sep 17 00:00:00 2001
From: Michael Bedy
Date: Fri, 14 Nov 2025 14:38:50 -0500
Subject: [PATCH 3/3] Fix nit.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8cf5f9e133b6e..8231f5234eac2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21413,7 +21413,7 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
     SD->init(SchedulingRegionID, I);
 
     auto CanIgnoreLoad = [](const Instruction *I) {
-      const LoadInst *LI = dyn_cast<LoadInst>(I);
+      const auto *LI = dyn_cast<LoadInst>(I);
       // If there is a simple load marked as invariant, we can ignore it.
// But, in the (unlikely) case of non-simple invariant load, // we should not ignore it.