[X86] Remove isel pattern for MMX_X86movdq2q+simple_load. Replace with DAG combine to loadmmx.

Only 64 bits will be loaded, not the whole 128 bits. We can
just combine it to a plain MMX load. This has the side effect of
enabling isel load folding for it.

This is part of my desire to get rid of isel patterns that shrink loads.
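
As a rough before/after sketch (assumed for illustration, not output captured from this commit; only the folded pshufw form appears in the new test below):

# Before: the removed isel pattern selected a separate 64-bit load.
movq    (%rdi), %mm0          # MMX_MOVQ64rm matched from MOVDQ2Q+load
pshufw  $68, %mm0, %mm0
# After: the DAG combine rewrites MOVDQ2Q+simple_load into an MVT::x86mmx
# load early, so isel's normal load folding can absorb it into the user.
pshufw  $68, (%rdi), %mm0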
topperc committed May 29, 2020
1 parent dbb5979 commit 87e4ad4
Showing 3 changed files with 51 additions and 5 deletions.
22 changes: 22 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47736,6 +47736,27 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
  return DAG.getBitcast(VT, Cvt);
}

static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
  SDValue Src = N->getOperand(0);

  // Turn MOVDQ2Q+simple_load into an mmx load.
  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
    LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());

    if (LN->isSimple()) {
      SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
                                  LN->getBasePtr(),
                                  LN->getPointerInfo(),
                                  LN->getOriginalAlign(),
                                  LN->getMemOperand()->getFlags());
      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
      return NewLd;
    }
  }

  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
@@ -47898,6 +47919,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
  case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
  case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
  case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
  case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
  }

  return SDValue();
3 changes: 0 additions & 3 deletions llvm/lib/Target/X86/X86InstrMMX.td
@@ -568,9 +568,6 @@ def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
          (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;

def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))),
          (x86mmx (MMX_MOVQ64rm addr:$src))>;

def : Pat<(v2i64 (X86vzmovl (scalar_to_vector
                   (i64 (bitconvert (x86mmx VR64:$src)))))),
          (MMX_MOVQ2DQrr VR64:$src)>;
31 changes: 29 additions & 2 deletions llvm/test/CodeGen/X86/mmx-fold-load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64

define i64 @t0(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t0:
@@ -616,3 +616,30 @@ entry:

declare void @llvm.lifetime.start(i64, i8* nocapture)
declare void @llvm.lifetime.end(i64, i8* nocapture)

; Make sure we shrink this vector load and fold it.
define x86_mmx @vec_load(<4 x float>* %x) {
; X86-LABEL: vec_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1]
; X86-NEXT: paddsb %mm0, %mm0
; X86-NEXT: retl
;
; X64-LABEL: vec_load:
; X64: # %bb.0:
; X64-NEXT: pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1]
; X64-NEXT: paddsb %mm0, %mm0
; X64-NEXT: movq2dq %mm0, %xmm0
; X64-NEXT: retq
  %z = load <4 x float>, <4 x float>* %x
  %y = extractelement <4 x float> %z, i32 0
  %a = insertelement <2 x float> undef, float %y, i32 0
  %b = insertelement <2 x float> %a, float %y, i32 1
  %c = bitcast <2 x float> %b to x86_mmx
  %d = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %c, x86_mmx %c)
  ret x86_mmx %d
}

declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
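
A note on the RUN line change at the top of this test: -disable-peephole turns off the post-isel peephole optimizer, which can also fold loads, so the folded pshufw $68, (%rdi), %mm0 in the checks demonstrates that the folding now happens during instruction selection itself. To inspect one configuration locally, an invocation modeled on the RUN lines should work:

llc < llvm/test/CodeGen/X86/mmx-fold-load.ll -disable-peephole \
    -mtriple=x86_64-unknown -mattr=+mmx,+sse2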
