[X86] Teach shouldSinkOperands to recognize pmuldq/pmuludq patterns.
The IR for the pmuldq/pmuludq intrinsics uses a sext_inreg/zext_inreg
pattern on the inputs. Ideally we pattern match these away during
isel, but LICM or other middle-end optimizations can separate the
extend from the mul. When that happens, SelectionDAG either cannot
remove the extend or, depending on how the extend is lowered, cannot
generate an AssertSExt/AssertZExt in the mul's basic block. Either
way, pmuldq/pmuludq is not formed at all.
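
As an illustration, the problematic IR shape looks roughly like the
following (a hand-written sketch, not the exact PR51371 reproducer;
in the real test case LICM hoists the extend out of a loop, and all
names here are illustrative):

  define <2 x i64> @pmuldq_sketch(<2 x i64> %a, <2 x i64> %b) {
  entry:
    ; sext_inreg from vXi32, expressed as shl 32 + ashr 32. The
    ; unsigned (pmuludq) variant is an 'and' with 4294967295 instead.
    %shl.a = shl <2 x i64> %a, <i64 32, i64 32>
    %sext.a = ashr <2 x i64> %shl.a, <i64 32, i64 32>
    br label %body

  body:
    ; The mul lives in a different basic block, so isel cannot see the
    ; extend of %a and no PMULDQ is formed.
    %shl.b = shl <2 x i64> %b, <i64 32, i64 32>
    %sext.b = ashr <2 x i64> %shl.b, <i64 32, i64 32>
    %mul = mul <2 x i64> %sext.a, %sext.b
    ret <2 x i64> %mul
  }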

This patch teaches shouldSinkOperands to recognize these extends so
that CodeGenPrepare will clone them into the same basic block as
the mul.
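
With the operand uses reported by shouldSinkOperands, CodeGenPrepare
duplicates the shl/ashr pair (or the and-mask in the zero-extended
case) next to the mul. For the sketch above, the block containing the
mul then ends up roughly as:

  body:
    %shl.a.sunk = shl <2 x i64> %a, <i64 32, i64 32>
    %sext.a.sunk = ashr <2 x i64> %shl.a.sunk, <i64 32, i64 32>
    %shl.b = shl <2 x i64> %b, <i64 32, i64 32>
    %sext.b = ashr <2 x i64> %shl.b, <i64 32, i64 32>
    ; Both extends are now visible during isel of this block, so the
    ; mul can be selected as PMULDQ.
    %mul = mul <2 x i64> %sext.a.sunk, %sext.b
    ret <2 x i64> %mul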

Fixes PR51371.

Differential Revision: https://reviews.llvm.org/D107689
topperc committed Aug 7, 2021
1 parent 8a2d1b1 commit 24dfba8
Showing 2 changed files with 37 additions and 24 deletions.
33 changes: 32 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48,9 +48,10 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -32091,6 +32092,36 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {

bool X86TargetLowering::shouldSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
  if (!VTy)
    return false;

  if (I->getOpcode() == Instruction::Mul &&
      VTy->getElementType()->isIntegerTy(64)) {
    for (auto &Op : I->operands()) {
      // Make sure we are not already sinking this operand
      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
        continue;

      // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
      // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
      if (Subtarget.hasSSE41() &&
          match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
                                 m_SpecificInt(32)))) {
        Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
        Ops.push_back(&Op);
      } else if (Subtarget.hasSSE2() &&
                 match(Op.get(),
                       m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
        Ops.push_back(&Op);
      }
    }

    return !Ops.empty();
  }

  // A uniform shift amount in a vector shift or funnel shift may be much
  // cheaper than a generic variable vector shift, so make that pattern visible
  // to SDAG by sinking the shuffle instruction next to the shift.
28 changes: 5 additions & 23 deletions llvm/test/CodeGen/X86/pr51371.ll
@@ -8,28 +8,12 @@ define void @pmuldq(<2 x i64>* nocapture %0, i32 %1, i64 %2) {
; CHECK-NEXT: je .LBB0_3
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: movd %esi, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: psllq $32, %xmm0
; CHECK-NEXT: psrad $31, %xmm0
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $32, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movdqa (%rdi), %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm3
; CHECK-NEXT: psllq $32, %xmm3
; CHECK-NEXT: psrad $31, %xmm3
; CHECK-NEXT: movdqa %xmm2, %xmm4
; CHECK-NEXT: pmuludq %xmm1, %xmm4
; CHECK-NEXT: psrlq $32, %xmm3
; CHECK-NEXT: pmuludq %xmm0, %xmm3
; CHECK-NEXT: paddq %xmm4, %xmm3
; CHECK-NEXT: psllq $32, %xmm3
; CHECK-NEXT: pmuludq %xmm0, %xmm2
; CHECK-NEXT: paddq %xmm3, %xmm2
; CHECK-NEXT: movdqa %xmm2, (%rdi)
; CHECK-NEXT: movdqa (%rdi), %xmm1
; CHECK-NEXT: pmuldq %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, (%rdi)
; CHECK-NEXT: addq $16, %rdi
; CHECK-NEXT: decq %rdx
; CHECK-NEXT: jne .LBB0_2
@@ -66,9 +50,7 @@ define void @pmuludq(<2 x i64>* nocapture %0, i32 %1, i64 %2) {
; CHECK-NEXT: je .LBB1_3
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: movd %esi, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movdqa (%rdi), %xmm1
