[X86] Memory folding for commutative instructions (updated)
This patch improves support for commutative instructions in the x86 memory folding implementation by attempting to fold a commuted version of the instruction if the original folding fails; if that folding fails as well, the instruction is 're-commuted' back to its original operand order before returning.

Updated version of r219584 (reverted in r219595): the commutation attempt now explicitly ensures that neither of the commuted source operands is tied to the destination operand/register, which was the source of all the regressions that occurred with the original patch attempt.
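To make that flow concrete, below is a minimal standalone C++ sketch, not the LLVM API: Instr, FoldPredicate, foldMemoryOperand and their fields are hypothetical stand-ins for the real MachineInstr/X86InstrInfo machinery. It models the fold / commute / retry / re-commute sequence and the tied-operand guard this update adds.

// Minimal standalone sketch (assumed names; not the LLVM API) of the
// commute-and-retry folding flow described above.
#include <cstdio>
#include <functional>
#include <utility>

struct Instr {
  unsigned DefReg;        // destination register (operand 0)
  unsigned SrcReg[2];     // the two commutable source registers
  bool SrcTiedToDef[2];   // whether each source operand is tied to the def
  void commute() {
    std::swap(SrcReg[0], SrcReg[1]);
    std::swap(SrcTiedToDef[0], SrcTiedToDef[1]);
  }
};

// Predicate standing in for the target's per-operand folding rules.
using FoldPredicate = std::function<bool(const Instr &, unsigned)>;

bool foldMemoryOperand(Instr &MI, unsigned Idx, const FoldPredicate &CanFold,
                       bool AllowCommute) {
  if (CanFold(MI, Idx))
    return true;                        // original folding succeeded

  if (!AllowCommute)
    return false;

  // Do not commute + fold if either commutable source is tied to the
  // destination register (the guard added in this updated patch).
  if ((MI.SrcTiedToDef[0] && MI.SrcReg[0] == MI.DefReg) ||
      (MI.SrcTiedToDef[1] && MI.SrcReg[1] == MI.DefReg))
    return false;

  MI.commute();                         // try the commuted form
  unsigned OtherIdx = (Idx == 0) ? 1u : 0u;
  if (CanFold(MI, OtherIdx))
    return true;

  MI.commute();                         // fold failed: 're-commute' back
  return false;
}

int main() {
  // Example: operand 0 cannot be folded directly, but the commuted form can.
  Instr MI{/*DefReg=*/1, {2, 3}, {false, false}};
  FoldPredicate CanFold = [](const Instr &, unsigned Idx) {
    return Idx == 1;                    // pretend only slot 1 accepts a fold
  };
  std::printf("folded: %d\n", foldMemoryOperand(MI, 0, CanFold, true) ? 1 : 0);
}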

Added additional regression test case provided by Joerg Sonnenberger.

Differential Revision: http://reviews.llvm.org/D5818

llvm-svn: 220239
RKSimon committed Oct 20, 2014
1 parent c1fdf7f commit 2f9548a
Showing 5 changed files with 202 additions and 59 deletions.
3 changes: 2 additions & 1 deletion llvm/lib/Target/X86/X86FastISel.cpp
@@ -3337,7 +3337,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
AM.getFullAddress(AddrOps);

MachineInstr *Result =
XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps,
Size, Alignment, /*AllowCommute=*/true);
if (!Result)
return false;

155 changes: 98 additions & 57 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3907,10 +3907,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
/// operand at the use. We fold the load instructions if load defines a virtual
/// register, the virtual register is used once in the same BB, and the
/// instructions in-between do not load or store, and have no side effects.
MachineInstr* X86InstrInfo::
optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
if (FoldAsLoadDefReg == 0)
return nullptr;
// To be conservative, if there exists another load, clear the load candidate.
@@ -3926,55 +3926,35 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
if (!DefMI->isSafeToMove(this, nullptr, SawStore))
return nullptr;

// We try to commute MI if possible.
unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
// Collect information about virtual register operands of MI.
unsigned SrcOperandId = 0;
bool FoundSrcOperand = false;
for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (Reg != FoldAsLoadDefReg)
continue;
// Do not fold if we have a subreg use or a def or multiple uses.
if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
return nullptr;

SrcOperandId = i;
FoundSrcOperand = true;
}
if (!FoundSrcOperand) return nullptr;

// Check whether we can fold the def into SrcOperandId.
SmallVector<unsigned, 8> Ops;
Ops.push_back(SrcOperandId);
MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
if (FoldMI) {
FoldAsLoadDefReg = 0;
return FoldMI;
}

if (Idx == 1) {
// MI was changed but it didn't help, commute it back!
commuteInstruction(MI, false);
// Collect information about virtual register operands of MI.
unsigned SrcOperandId = 0;
bool FoundSrcOperand = false;
for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (Reg != FoldAsLoadDefReg)
continue;
// Do not fold if we have a subreg use or a def or multiple uses.
if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
return nullptr;
}

// Check whether we can commute MI and enable folding.
if (MI->isCommutable()) {
MachineInstr *NewMI = commuteInstruction(MI, false);
// Unable to commute.
if (!NewMI) return nullptr;
if (NewMI != MI) {
// New instruction. It doesn't need to be kept.
NewMI->eraseFromParent();
return nullptr;
}
}
SrcOperandId = i;
FoundSrcOperand = true;
}
if (!FoundSrcOperand)
return nullptr;

// Check whether we can fold the def into SrcOperandId.
SmallVector<unsigned, 8> Ops;
Ops.push_back(SrcOperandId);
MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
if (FoldMI) {
FoldAsLoadDefReg = 0;
return FoldMI;
}

return nullptr;
}

@@ -4134,7 +4114,8 @@ MachineInstr*
X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI, unsigned i,
const SmallVectorImpl<MachineOperand> &MOs,
unsigned Size, unsigned Align) const {
unsigned Size, unsigned Align,
bool AllowCommute) const {
const DenseMap<unsigned,
std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
bool isCallRegIndirect = Subtarget.callRegIndirect();
@@ -4202,8 +4183,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
return nullptr;
// If this is a 64-bit load, but the spill slot is 32, then we can do
// a 32-bit load which is implicitly zero-extended. This likely is due
// to liveintervalanalysis remat'ing a load from stack slot.
// a 32-bit load which is implicitly zero-extended. This likely is
// due to live interval analysis remat'ing a load from stack slot.
if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
return nullptr;
Opcode = X86::MOV32rm;
@@ -4222,15 +4203,73 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// to a 32-bit one.
unsigned DstReg = NewMI->getOperand(0).getReg();
if (TargetRegisterInfo::isPhysicalRegister(DstReg))
NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
X86::sub_32bit));
NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
else
NewMI->getOperand(0).setSubReg(X86::sub_32bit);
}
return NewMI;
}
}

// If the instruction and target operand are commutable, commute the
// instruction and try again.
if (AllowCommute) {
unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2;
if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
bool HasDef = MI->getDesc().getNumDefs();
unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
bool Tied0 =
0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
bool Tied1 =
0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);

// If either of the commutable operands are tied to the destination
// then we can not commute + fold.
if ((HasDef && Reg0 == Reg1 && Tied0) ||
(HasDef && Reg0 == Reg2 && Tied1))
return nullptr;

if ((CommuteOpIdx1 == OriginalOpIdx) ||
(CommuteOpIdx2 == OriginalOpIdx)) {
MachineInstr *CommutedMI = commuteInstruction(MI, false);
if (!CommutedMI) {
// Unable to commute.
return nullptr;
}
if (CommutedMI != MI) {
// New instruction. We can't fold from this.
CommutedMI->eraseFromParent();
return nullptr;
}

// Attempt to fold with the commuted version of the instruction.
unsigned CommuteOp =
(CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
/*AllowCommute=*/false);
if (NewMI)
return NewMI;

// Folding failed again - undo the commute before returning.
MachineInstr *UncommutedMI = commuteInstruction(MI, false);
if (!UncommutedMI) {
// Unable to commute.
return nullptr;
}
if (UncommutedMI != MI) {
// New instruction. It doesn't need to be kept.
UncommutedMI->eraseFromParent();
return nullptr;
}

// Return here to prevent duplicate fuse failure report.
return nullptr;
}
}
}

// No fusion
if (PrintFailedFusing && !MI->isCopy())
dbgs() << "We failed to fuse operand " << i << " in " << *MI;
@@ -4440,7 +4479,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,

SmallVector<MachineOperand,4> MOs;
MOs.push_back(MachineOperand::CreateFI(FrameIndex));
return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
Size, Alignment, /*AllowCommute=*/true);
}

static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
@@ -4593,7 +4633,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
break;
}
}
return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
/*Size=*/0, Alignment, /*AllowCommute=*/true);
}


3 changes: 2 additions & 1 deletion llvm/lib/Target/X86/X86InstrInfo.h
@@ -404,7 +404,8 @@ class X86InstrInfo final : public X86GenInstrInfo {
MachineInstr* MI,
unsigned OpNum,
const SmallVectorImpl<MachineOperand> &MOs,
unsigned Size, unsigned Alignment) const;
unsigned Size, unsigned Alignment,
bool AllowCommute) const;

void
getUnconditionalBranch(MCInst &Branch,
16 changes: 16 additions & 0 deletions llvm/test/CodeGen/X86/avx1-stack-reload-folding.ll
@@ -0,0 +1,16 @@
; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Function Attrs: nounwind readonly uwtable
define <32 x double> @_Z14vstack_foldDv32_dS_(<32 x double> %a, <32 x double> %b) #0 {
%1 = fadd <32 x double> %a, %b
%2 = fsub <32 x double> %a, %b
%3 = fmul <32 x double> %1, %2
ret <32 x double> %3

;CHECK-NOT: vmovapd {{.*#+}} 32-byte Reload
;CHECK: vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
;CHECK-NOT: vmovapd {{.*#+}} 32-byte Reload
}
84 changes: 84 additions & 0 deletions llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -0,0 +1,84 @@
; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
; Regression test for http://reviews.llvm.org/D5701

; ModuleID = 'xxhash.i'
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386--netbsd"

; CHECK-LABEL: fn1
; CHECK: shldl {{.*#+}} 4-byte Folded Spill
; CHECK: orl {{.*#+}} 4-byte Folded Reload
; CHECK: shldl {{.*#+}} 4-byte Folded Spill
; CHECK: orl {{.*#+}} 4-byte Folded Reload
; CHECK: addl {{.*#+}} 4-byte Folded Reload
; CHECK: imull {{.*#+}} 4-byte Folded Reload
; CHECK: orl {{.*#+}} 4-byte Folded Reload
; CHECK: retl

%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }

@a = common global i32 0, align 4
@b = common global i64 0, align 8

; Function Attrs: nounwind uwtable
define i64 @fn1() #0 {
entry:
%0 = load i32* @a, align 4, !tbaa !1
%1 = inttoptr i32 %0 to %struct.XXH_state64_t*
%total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0
%2 = load i32* %total_len, align 4, !tbaa !5
%tobool = icmp eq i32 %2, 0
br i1 %tobool, label %if.else, label %if.then

if.then: ; preds = %entry
%v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3
%3 = load i64* %v3, align 4, !tbaa !8
%v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4
%4 = load i64* %v4, align 4, !tbaa !9
%v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2
%5 = load i64* %v2, align 4, !tbaa !10
%shl = shl i64 %5, 1
%or = or i64 %shl, %5
%shl2 = shl i64 %3, 2
%shr = lshr i64 %3, 1
%or3 = or i64 %shl2, %shr
%add = add i64 %or, %or3
%mul = mul i64 %4, -4417276706812531889
%shl4 = mul i64 %4, -8834553413625063778
%shr5 = ashr i64 %mul, 3
%or6 = or i64 %shr5, %shl4
%mul7 = mul nsw i64 %or6, 1400714785074694791
%xor = xor i64 %add, %mul7
store i64 %xor, i64* @b, align 8, !tbaa !11
%mul8 = mul nsw i64 %xor, 1400714785074694791
br label %if.end

if.else: ; preds = %entry
%6 = load i64* @b, align 8, !tbaa !11
%xor10 = xor i64 %6, -4417276706812531889
%mul11 = mul nsw i64 %xor10, 400714785074694791
br label %if.end

if.end: ; preds = %if.else, %if.then
%storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]
%storemerge = add i64 %storemerge.in, -8796714831421723037
store i64 %storemerge, i64* @b, align 8, !tbaa !11
ret i64 undef
}

attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = metadata !{metadata !"clang version 3.6 (trunk 219587)"}
!1 = metadata !{metadata !2, metadata !2, i64 0}
!2 = metadata !{metadata !"int", metadata !3, i64 0}
!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
!4 = metadata !{metadata !"Simple C/C++ TBAA"}
!5 = metadata !{metadata !6, metadata !2, i64 0}
!6 = metadata !{metadata !"XXH_state64_t", metadata !2, i64 0, metadata !2, i64 4, metadata !7, i64 8, metadata !7, i64 16, metadata !7, i64 24}
!7 = metadata !{metadata !"long long", metadata !3, i64 0}
!8 = metadata !{metadata !6, metadata !7, i64 16}
!9 = metadata !{metadata !6, metadata !7, i64 24}
!10 = metadata !{metadata !6, metadata !7, i64 8}
!11 = metadata !{metadata !7, metadata !7, i64 0}
