[GlobalISel] Implement merging of stores of truncates.

This is a port of a combine which matches a pattern where a wide type scalar
value is stored by several narrow stores. It folds them into a single store or
a BSWAP and a store if the target supports it.

Assuming little endian target:
 i8 *p = ...
 i32 val = ...
 p[0] = (val >> 0) & 0xFF;
 p[1] = (val >> 8) & 0xFF;
 p[2] = (val >> 16) & 0xFF;
 p[3] = (val >> 24) & 0xFF;
=>
 *((i32)p) = val;
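
The reverse byte order likewise folds to a byte swap plus a single store, as
shown in the doc comment of the new combine:

 i8 *p = ...
 i32 val = ...
 p[0] = (val >> 24) & 0xFF;
 p[1] = (val >> 16) & 0xFF;
 p[2] = (val >> 8) & 0xFF;
 p[3] = (val >> 0) & 0xFF;
=>
 *((i32)p) = BSWAP(val);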

On CTMark AArch64 -Os this results in a good amount of code size savings:

Program            before        after       diff
             SPASS 412792       412788       -0.0%
                kc 432528       432512       -0.0%
            lencod 430112       430096       -0.0%
  consumer-typeset 419156       419128       -0.0%
            bullet 475840       475752       -0.0%
        tramp3d-v4 367760       367628       -0.0%
          clamscan 383388       383204       -0.0%
    pairlocalalign 249764       249476       -0.1%
    7zip-benchmark 570100       568860       -0.2%
           sqlite3 287628       286920       -0.2%
Geomean difference                           -0.1%

Differential Revision: https://reviews.llvm.org/D109419
aemerson committed Sep 9, 2021
1 parent e741fab commit eae44c8
Showing 5 changed files with 1,373 additions and 2 deletions.
11 changes: 11 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -74,6 +74,14 @@ struct ShiftOfShiftedLogic {

using BuildFnTy = std::function<void(MachineIRBuilder &)>;

struct MergeTruncStoresInfo {
SmallVector<GStore *> FoundStores;
GStore *LowestIdxStore = nullptr;
Register WideSrcVal;
bool NeedBSwap = false;
bool NeedRotate = false;
};

using OperandBuildSteps =
SmallVector<std::function<void(MachineInstrBuilder &)>, 4>;
struct InstructionBuildSteps {
@@ -523,6 +531,9 @@ class CombinerHelper {
/// bswap.
bool matchLoadOrCombine(MachineInstr &MI, BuildFnTy &MatchInfo);

bool matchTruncStoreMerge(MachineInstr &MI, MergeTruncStoresInfo &MatchInfo);
void applyTruncStoreMerge(MachineInstr &MI, MergeTruncStoresInfo &MatchInfo);

bool matchExtendThroughPhis(MachineInstr &MI, MachineInstr *&ExtMI);
void applyExtendThroughPhis(MachineInstr &MI, MachineInstr *&ExtMI);

12 changes: 10 additions & 2 deletions llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -594,6 +594,14 @@ def load_or_combine : GICombineRule<
[{ return Helper.matchLoadOrCombine(*${root}, ${info}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;


def truncstore_merge_matcdata : GIDefMatchData<"MergeTruncStoresInfo">;
def truncstore_merge : GICombineRule<
(defs root:$root, truncstore_merge_matcdata:$info),
(match (wip_match_opcode G_STORE):$root,
[{ return Helper.matchTruncStoreMerge(*${root}, ${info}); }]),
(apply [{ Helper.applyTruncStoreMerge(*${root}, ${info}); }])>;

def extend_through_phis_matchdata: GIDefMatchData<"MachineInstr*">;
def extend_through_phis : GICombineRule<
(defs root:$root, extend_through_phis_matchdata:$matchinfo),
@@ -733,8 +741,8 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shl,
const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
div_rem_to_divrem, funnel_shift_combines, form_bitfield_extract,
constant_fold]>;
truncstore_merge, div_rem_to_divrem, funnel_shift_combines,
form_bitfield_extract, constant_fold]>;

// A combine group used for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
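
For context, the rule above wires a G_STORE root to the new CombinerHelper
entry points. The TableGen-generated combiner conceptually runs a match/apply
pair roughly like the following (a simplified sketch, not the actual generated
code):

// Simplified sketch of the dispatch the combiner backend emits for this rule.
if (MI.getOpcode() == TargetOpcode::G_STORE) {
  MergeTruncStoresInfo MatchInfo;
  if (Helper.matchTruncStoreMerge(MI, MatchInfo)) {
    Helper.applyTruncStoreMerge(MI, MatchInfo);
    return true;
  }
}
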
267 changes: 267 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -28,6 +28,8 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MathExtras.h"
#include <tuple>

@@ -3266,6 +3268,271 @@ bool CombinerHelper::matchLoadOrCombine(
return true;
}

/// Check if the store \p Store is a truncstore that can be merged. That is,
/// it's a store of a shifted value of \p SrcVal. If \p SrcVal is an empty
/// Register then it does not need to match and SrcVal is set to the source
/// value found.
/// On match, returns the start byte offset of the \p SrcVal that is being
/// stored.
static Optional<int64_t> getTruncStoreByteOffset(GStore &Store, Register &SrcVal,
MachineRegisterInfo &MRI) {
Register TruncVal;
if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal))))
return None;

// The shift amount must be a constant multiple of the narrow type.
// It is translated to the offset address in the wide source value "y".
//
// x = G_LSHR y, ShiftAmtC
// s8 z = G_TRUNC x
// store z, ...
Register FoundSrcVal;
int64_t ShiftAmt;
if (!mi_match(TruncVal, MRI,
m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)),
m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) {
if (!SrcVal.isValid() || TruncVal == SrcVal) {
if (!SrcVal.isValid())
SrcVal = TruncVal;
return 0; // If it's the lowest index store.
}
return None;
}

unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits();
if (ShiftAmt % NarrowBits != 0)
return None;
const unsigned Offset = ShiftAmt / NarrowBits;

if (SrcVal.isValid() && FoundSrcVal != SrcVal)
return None;

if (!SrcVal.isValid())
SrcVal = FoundSrcVal;
else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal))
return None;
return Offset;
}
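
// Illustrative example (comment added for this write-up, not part of the
// original patch): when an s32 value is stored as four s8 pieces, NarrowBits
// is 8, so a store of G_TRUNC(G_LSHR val, 16) maps to byte offset 16 / 8 = 2,
// and the store of the unshifted G_TRUNC maps to byte offset 0.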

/// Match a pattern where a wide type scalar value is stored by several narrow
/// stores. Fold it into a single store or a BSWAP and a store if the target
/// supports it.
///
/// Assuming little endian target:
/// i8 *p = ...
/// i32 val = ...
/// p[0] = (val >> 0) & 0xFF;
/// p[1] = (val >> 8) & 0xFF;
/// p[2] = (val >> 16) & 0xFF;
/// p[3] = (val >> 24) & 0xFF;
/// =>
/// *((i32)p) = val;
///
/// i8 *p = ...
/// i32 val = ...
/// p[0] = (val >> 24) & 0xFF;
/// p[1] = (val >> 16) & 0xFF;
/// p[2] = (val >> 8) & 0xFF;
/// p[3] = (val >> 0) & 0xFF;
/// =>
/// *((i32)p) = BSWAP(val);
bool CombinerHelper::matchTruncStoreMerge(MachineInstr &MI,
MergeTruncStoresInfo &MatchInfo) {
auto &StoreMI = cast<GStore>(MI);
LLT MemTy = StoreMI.getMMO().getMemoryType();

// We only handle merging simple stores of 1-4 bytes.
if (!MemTy.isScalar())
return false;
switch (MemTy.getSizeInBits()) {
case 8:
case 16:
case 32:
break;
default:
return false;
}
if (!StoreMI.isSimple())
return false;

// We do a simple search for mergeable stores prior to this one.
// Any potential alias hazard along the way terminates the search.
SmallVector<GStore *> FoundStores;

// We're looking for:
// 1) a (store(trunc(...)))
// 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get
// the partial value stored.
// 3) where the offsets form either a little or big-endian sequence.

auto &LastStore = StoreMI;

// The single base pointer that all stores must use.
Register BaseReg;
int64_t LastOffset;
if (!mi_match(LastStore.getPointerReg(), MRI,
m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) {
BaseReg = LastStore.getPointerReg();
LastOffset = 0;
}

GStore *LowestIdxStore = &LastStore;
int64_t LowestIdxOffset = LastOffset;

Register WideSrcVal;
auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, MRI);
if (!LowestShiftAmt)
return false; // Didn't match a trunc.
assert(WideSrcVal.isValid());

LLT WideStoreTy = MRI.getType(WideSrcVal);
const unsigned NumStoresRequired =
WideStoreTy.getSizeInBits() / MemTy.getSizeInBits();

SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX);
OffsetMap[*LowestShiftAmt] = LastOffset;
FoundStores.emplace_back(&LastStore);

// Search the block up for more stores.
// We use a search threshold of 10 instructions here because the combiner
// works top-down within a block, and we don't want to search an unbounded
// number of predecessor instructions trying to find matching stores.
// If we moved this optimization into a separate pass then we could probably
// use a more efficient search without having a hard-coded threshold.
const int MaxInstsToCheck = 10;
int NumInstsChecked = 0;
for (auto II = ++LastStore.getReverseIterator();
II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck;
++II) {
NumInstsChecked++;
GStore *NewStore;
if ((NewStore = dyn_cast<GStore>(&*II))) {
if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple())
break;
} else if (II->isLoadFoldBarrier() || II->mayLoad()) {
break;
} else {
continue; // This is a safe instruction we can look past.
}

Register NewBaseReg;
int64_t MemOffset;
// Check we're storing to the same base + some offset.
if (!mi_match(NewStore->getPointerReg(), MRI,
m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) {
NewBaseReg = NewStore->getPointerReg();
MemOffset = 0;
}
if (BaseReg != NewBaseReg)
break;

auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, MRI);
if (!ShiftByteOffset)
break;
if (MemOffset < LowestIdxOffset) {
LowestIdxOffset = MemOffset;
LowestIdxStore = NewStore;
}

// Map the offset in the store and the offset in the combined value, and
// early return if it has been set before.
if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired ||
OffsetMap[*ShiftByteOffset] != INT64_MAX)
break;
OffsetMap[*ShiftByteOffset] = MemOffset;

FoundStores.emplace_back(NewStore);
// Reset counter since we've found a matching inst.
NumInstsChecked = 0;
if (FoundStores.size() == NumStoresRequired)
break;
}

if (FoundStores.size() != NumStoresRequired) {
return false;
}

const auto &DL = LastStore.getMF()->getDataLayout();
auto &C = LastStore.getMF()->getFunction().getContext();
// Check that a store of the wide type is both allowed and fast on the target
bool Fast = false;
bool Allowed = getTargetLowering().allowsMemoryAccess(
C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast);
if (!Allowed || !Fast)
return false;

// Check if the pieces of the value are going to the expected places in memory
// to merge the stores.
unsigned NarrowBits = MemTy.getScalarSizeInBits();
auto checkOffsets = [&](bool MatchLittleEndian) {
if (MatchLittleEndian) {
for (unsigned i = 0; i != NumStoresRequired; ++i)
if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset)
return false;
} else { // MatchBigEndian by reversing loop counter.
for (unsigned i = 0, j = NumStoresRequired - 1; i != NumStoresRequired;
++i, --j)
if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset)
return false;
}
return true;
};

// Check if the offsets line up for the native data layout of this target.
bool NeedBswap = false;
bool NeedRotate = false;
if (!checkOffsets(DL.isLittleEndian())) {
// Special-case: check if byte offsets line up for the opposite endian.
if (NarrowBits == 8 && checkOffsets(DL.isBigEndian()))
NeedBswap = true;
else if (NumStoresRequired == 2 && checkOffsets(DL.isBigEndian()))
NeedRotate = true;
else
return false;
}
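
// Illustrative note (added for this write-up, not part of the original
// patch): the rotate case covers two-piece stores whose halves land in
// swapped positions, e.g. an s32 value stored as two s16 halves in reversed
// order; that is equivalent to storing the value rotated right by half its
// width, which applyTruncStoreMerge emits below.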

if (NeedBswap &&
!isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}}))
return false;
if (NeedRotate &&
!isLegalOrBeforeLegalizer({TargetOpcode::G_ROTR, {WideStoreTy}}))
return false;

MatchInfo.NeedBSwap = NeedBswap;
MatchInfo.NeedRotate = NeedRotate;
MatchInfo.LowestIdxStore = LowestIdxStore;
MatchInfo.WideSrcVal = WideSrcVal;
MatchInfo.FoundStores = std::move(FoundStores);
return true;
}

void CombinerHelper::applyTruncStoreMerge(MachineInstr &MI,
MergeTruncStoresInfo &MatchInfo) {

Builder.setInstrAndDebugLoc(MI);
Register WideSrcVal = MatchInfo.WideSrcVal;
LLT WideStoreTy = MRI.getType(WideSrcVal);

if (MatchInfo.NeedBSwap) {
WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0);
} else if (MatchInfo.NeedRotate) {
assert(WideStoreTy.getSizeInBits() % 2 == 0 &&
"Unexpected type for rotate");
auto RotAmt =
Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2);
WideSrcVal =
Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0);
}

Builder.buildStore(WideSrcVal, MatchInfo.LowestIdxStore->getPointerReg(),
MatchInfo.LowestIdxStore->getMMO().getPointerInfo(),
MatchInfo.LowestIdxStore->getMMO().getAlign());

// Erase the old stores.
for (auto *ST : MatchInfo.FoundStores)
ST->eraseFromParent();
}

bool CombinerHelper::matchExtendThroughPhis(MachineInstr &MI,
MachineInstr *&ExtMI) {
assert(MI.getOpcode() == TargetOpcode::G_PHI);
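
To make the endianness check concrete, here is a minimal standalone sketch of
the byte-offset test that decides between a plain wide store, a BSWAP, or a
rotate. It is not LLVM code; the signature and the small driver are invented
for illustration:

#include <array>
#include <cassert>
#include <cstdint>

// OffsetMap[i] holds the memory offset (in bytes, relative to the base) at
// which the piece containing byte slot i of the wide value was stored. The
// pieces are mergeable when they form a contiguous run starting at
// LowestOffset in either little-endian or big-endian order.
static bool checkOffsets(const std::array<int64_t, 4> &OffsetMap,
                         unsigned NumStores, unsigned NarrowBytes,
                         int64_t LowestOffset, bool MatchLittleEndian) {
  for (unsigned i = 0; i != NumStores; ++i) {
    unsigned Slot = MatchLittleEndian ? i : NumStores - 1 - i;
    if (OffsetMap[Slot] != int64_t(i) * NarrowBytes + LowestOffset)
      return false;
  }
  return true;
}

int main() {
  // p[0] = val >> 0, ..., p[3] = val >> 24: byte slot i is stored at offset
  // i, which is the little-endian layout, so a plain wide store suffices on
  // a little-endian target.
  std::array<int64_t, 4> Forward = {0, 1, 2, 3};
  assert(checkOffsets(Forward, 4, 1, 0, /*MatchLittleEndian=*/true));

  // p[0] = val >> 24, ..., p[3] = val >> 0: byte slot i is stored at offset
  // 3 - i, which only matches the big-endian layout, so a little-endian
  // target needs a G_BSWAP before the wide store.
  std::array<int64_t, 4> Reversed = {3, 2, 1, 0};
  assert(!checkOffsets(Reversed, 4, 1, 0, /*MatchLittleEndian=*/true));
  assert(checkOffsets(Reversed, 4, 1, 0, /*MatchLittleEndian=*/false));
  return 0;
}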
