diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h index 29b797cee260d6..e48a70164bec6c 100644 --- a/llvm/include/llvm/MCA/Instruction.h +++ b/llvm/include/llvm/MCA/Instruction.h @@ -458,9 +458,6 @@ struct InstrDesc { // A bitmask of used processor resource units. uint64_t UsedProcResUnits; - // A bitmask of implicit uses of processor resource units. - uint64_t ImplicitlyUsedProcResUnits; - // A bitmask of used processor resource groups. uint64_t UsedProcResGroups; @@ -481,6 +478,9 @@ struct InstrDesc { // recycled. unsigned IsRecyclable : 1; + // True if some of the consumed group resources are partially overlapping. + unsigned HasPartiallyOverlappingGroups : 1; + // A zero latency instruction doesn't consume any scheduler resources. bool isZeroLatency() const { return !MaxLatency && Resources.empty(); } diff --git a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp index 3687a24279c2eb..600fe5b7a18729 100644 --- a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp +++ b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp @@ -281,26 +281,67 @@ void ResourceManager::releaseBuffers(uint64_t ConsumedBuffers) { uint64_t ResourceManager::checkAvailability(const InstrDesc &Desc) const { uint64_t BusyResourceMask = 0; + uint64_t ConsumedResourceMask = 0; + DenseMap<uint64_t, unsigned> AvailableUnits; + for (const std::pair<uint64_t, ResourceUsage> &E : Desc.Resources) { unsigned NumUnits = E.second.isReserved() ? 
0U : E.second.NumUnits; - unsigned Index = getResourceStateIndex(E.first); - if (!Resources[Index]->isReady(NumUnits)) + const ResourceState &RS = *Resources[getResourceStateIndex(E.first)]; + if (!RS.isReady(NumUnits)) { BusyResourceMask |= E.first; - } + continue; + } - uint64_t ImplicitUses = Desc.ImplicitlyUsedProcResUnits; - while (ImplicitUses) { - uint64_t Use = ImplicitUses & -ImplicitUses; - ImplicitUses ^= Use; - unsigned Index = getResourceStateIndex(Use); - if (!Resources[Index]->isReady(/* NumUnits */ 1)) - BusyResourceMask |= Index; + if (Desc.HasPartiallyOverlappingGroups && !RS.isAResourceGroup()) { + unsigned NumAvailableUnits = countPopulation(RS.getReadyMask()); + NumAvailableUnits -= NumUnits; + AvailableUnits[E.first] = NumAvailableUnits; + if (!NumAvailableUnits) + ConsumedResourceMask |= E.first; + } } BusyResourceMask &= ProcResUnitMask; if (BusyResourceMask) return BusyResourceMask; - return Desc.UsedProcResGroups & ReservedResourceGroups; + + BusyResourceMask = Desc.UsedProcResGroups & ReservedResourceGroups; + if (!Desc.HasPartiallyOverlappingGroups || BusyResourceMask) + return BusyResourceMask; + + // If this instruction has overlapping groups, make sure that we can + // select at least one unit per group. 
+ for (const std::pair<uint64_t, ResourceUsage> &E : Desc.Resources) { + const ResourceState &RS = *Resources[getResourceStateIndex(E.first)]; + if (!E.second.isReserved() && RS.isAResourceGroup()) { + uint64_t ReadyMask = RS.getReadyMask() & ~ConsumedResourceMask; + if (!ReadyMask) { + BusyResourceMask |= RS.getReadyMask(); + continue; + } + + uint64_t ResourceMask = PowerOf2Floor(ReadyMask); + + auto it = AvailableUnits.find(ResourceMask); + if (it == AvailableUnits.end()) { + unsigned Index = getResourceStateIndex(ResourceMask); + unsigned NumUnits = countPopulation(Resources[Index]->getReadyMask()); + it = + AvailableUnits.insert(std::make_pair(ResourceMask, NumUnits)).first; + } + + if (!it->second) { + BusyResourceMask |= it->first; + continue; + } + + it->second--; + if (!it->second) + ConsumedResourceMask |= it->first; + } + } + + return BusyResourceMask; } void ResourceManager::issueInstruction( diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp index 45acea25358731..71c565236e8875 100644 --- a/llvm/lib/MCA/InstrBuilder.cpp +++ b/llvm/lib/MCA/InstrBuilder.cpp @@ -112,13 +112,12 @@ static void initializeUsedResources(InstrDesc &ID, uint64_t UsedResourceUnits = 0; uint64_t UsedResourceGroups = 0; - auto GroupIt = find_if(Worklist, [](const ResourcePlusCycles &Elt) { - return countPopulation(Elt.first) > 1; - }); - unsigned FirstGroupIdx = std::distance(Worklist.begin(), GroupIt); - uint64_t ImpliedUsesOfResourceUnits = 0; + uint64_t UnitsFromResourceGroups = 0; + + // Remove cycles contributed by smaller resources, and check if there + // are partially overlapping resource groups. + ID.HasPartiallyOverlappingGroups = false; - // Remove cycles contributed by smaller resources. 
for (unsigned I = 0, E = Worklist.size(); I < E; ++I) { ResourcePlusCycles &A = Worklist[I]; if (!A.second.size()) { @@ -129,21 +128,17 @@ static void initializeUsedResources(InstrDesc &ID, ID.Resources.emplace_back(A); uint64_t NormalizedMask = A.first; + if (countPopulation(A.first) == 1) { UsedResourceUnits |= A.first; } else { // Remove the leading 1 from the resource group mask. NormalizedMask ^= PowerOf2Floor(NormalizedMask); - UsedResourceGroups |= (A.first ^ NormalizedMask); + if (UnitsFromResourceGroups & NormalizedMask) + ID.HasPartiallyOverlappingGroups = true; - uint64_t AvailableMask = NormalizedMask & ~UsedResourceUnits; - if ((NormalizedMask != AvailableMask) && - countPopulation(AvailableMask) == 1) { - // At simulation time, this resource group use will decay into a simple - // use of the resource unit identified by `AvailableMask`. - ImpliedUsesOfResourceUnits |= AvailableMask; - UsedResourceUnits |= AvailableMask; - } + UnitsFromResourceGroups |= NormalizedMask; + UsedResourceGroups |= (A.first ^ NormalizedMask); } for (unsigned J = I + 1; J < E; ++J) { @@ -156,31 +151,6 @@ static void initializeUsedResources(InstrDesc &ID, } } - // Look for implicit uses of processor resource units. These are resource - // units which are indirectly consumed by resource groups, and that must be - // always available on instruction issue. - while (ImpliedUsesOfResourceUnits) { - ID.ImplicitlyUsedProcResUnits |= ImpliedUsesOfResourceUnits; - ImpliedUsesOfResourceUnits = 0; - for (unsigned I = FirstGroupIdx, E = Worklist.size(); I < E; ++I) { - ResourcePlusCycles &A = Worklist[I]; - if (!A.second.size()) - continue; - - uint64_t NormalizedMask = A.first; - assert(countPopulation(NormalizedMask) > 1); - // Remove the leading 1 from the resource group mask. 
- NormalizedMask ^= PowerOf2Floor(NormalizedMask); - uint64_t AvailableMask = NormalizedMask & ~UsedResourceUnits; - if ((NormalizedMask != AvailableMask) && - countPopulation(AvailableMask) != 1) - continue; - - UsedResourceUnits |= AvailableMask; - ImpliedUsesOfResourceUnits |= AvailableMask; - } - } - // A SchedWrite may specify a number of cycles in which a resource group // is reserved. For example (on target x86; cpu Haswell): // @@ -240,10 +210,10 @@ static void initializeUsedResources(InstrDesc &ID, BufferIDs ^= Current; } dbgs() << "\t\t Used Units=" << format_hex(ID.UsedProcResUnits, 16) << '\n'; - dbgs() << "\t\tImplicitly Used Units=" - << format_hex(ID.ImplicitlyUsedProcResUnits, 16) << '\n'; dbgs() << "\t\tUsed Groups=" << format_hex(ID.UsedProcResGroups, 16) << '\n'; + dbgs() << "\t\tHasPartiallyOverlappingGroups=" + << ID.HasPartiallyOverlappingGroups << '\n'; }); } diff --git a/llvm/test/tools/llvm-mca/X86/AlderlakeP/partially-overlapping-groups.s b/llvm/test/tools/llvm-mca/X86/AlderlakeP/partially-overlapping-groups.s new file mode 100644 index 00000000000000..6229ddb5909eff --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/AlderlakeP/partially-overlapping-groups.s @@ -0,0 +1,21 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=alderlake -all-views=false -summary-view < %s | FileCheck %s + +# Issue #57548 + +# Do not crash when simulating instructions that consume partially overlapping +# resource groups. + +vpsllw %xmm1, %ymm0, %ymm0 +vpsllw %xmm1, %xmm2, %xmm1 +vpand %ymm1, %ymm0, %ymm0 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 300 +# CHECK-NEXT: Total Cycles: 503 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.60 +# CHECK-NEXT: Block RThroughput: 1.0