[X86] narrowBitOpRMW - use reachesChainWithoutSideEffects instead of direct chain matching #165870
Conversation
This will allow us to match RMW load/store chains through TokenFactor nodes if there are additional loads in the chain before the store.
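For illustration, a reduced IR sketch of the case the old code missed (@multiload_btr_sketch is a hypothetical function, modeled on the reset_multiload_i128 test in the diff below): the unrelated load of %p is chained between the i128 RMW load and store, so the store's chain operand becomes a TokenFactor of both loads rather than the RMW load itself, and the previous direct dyn_cast<LoadSDNode>(St->getChain()) check rejected the pattern.

define i32 @multiload_btr_sketch(ptr %word, i32 %position, ptr %p) nounwind {
  %rem = and i32 %position, 127
  %ofs = zext nneg i32 %rem to i128
  %bit = shl nuw i128 1, %ofs
  %mask = xor i128 %bit, -1
  %ld = load i128, ptr %word        ; RMW load of the wide integer
  %other = load i32, ptr %p         ; unrelated load, chained before the store
  %res = and i128 %ld, %mask        ; BTR-style single-bit reset
  store i128 %res, ptr %word        ; store chain is a TokenFactor of both loads
  ret i32 %other
}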
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: This will allow us to match RMW load/store chains through TokenFactor nodes if there are additional loads in the chain before the store.

Full diff: https://github.com/llvm/llvm-project/pull/165870.diff

2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 007074c3ffc82..62d68f45df177 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53351,21 +53351,11 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
-
- // Only handle normal stores and its chain was a matching normal load.
- auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
- if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
- !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
- Ld->getBasePtr() != St->getBasePtr() ||
- Ld->getOffset() != St->getOffset())
- return SDValue();
-
- SDValue LoadVal(Ld, 0);
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
- // Only narrow larger than legal scalar integers.
- if (!VT.isScalarInteger() ||
+ // Only narrow normal stores of larger than legal scalar integers.
+ if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() ||
VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
return SDValue();
@@ -53374,18 +53364,26 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
// BTC: X ^ (1 << ShAmt)
//
// BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
- SDValue InsertBit, ShAmt;
+ SDValue SrcVal, InsertBit, ShAmt;
if (!StoredVal.hasOneUse() ||
- !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
+ !(sd_match(StoredVal, m_And(m_Value(SrcVal),
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
sd_match(StoredVal,
- m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
- sd_match(StoredVal,
- m_Or(m_And(m_Specific(LoadVal),
- m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
- m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ sd_match(
+ StoredVal,
+ m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+ m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ return SDValue();
+
+ // SrcVal must be a matching normal load further up the chain.
+ auto *Ld = dyn_cast<LoadSDNode>(SrcVal);
+ if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+ Ld->getBasePtr() != St->getBasePtr() ||
+ Ld->getOffset() != St->getOffset() ||
+ !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)))
return SDValue();
// Ensure the shift amount is in bounds.
@@ -53419,7 +53417,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SDNodeFlags::NoUnsignedWrap);
// Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
+ SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt);
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 5776c6c82bcc3..c197a83835506 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
; bt/btc/btr/bts patterns + 'init' to set single bit value in large integers
@@ -1029,151 +1029,46 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-LABEL: reset_multiload_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 36(%esp,%edi), %edx
-; X86-NEXT: movl 40(%esp,%edi), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 32(%esp,%edi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esp,%edi), %edi
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx,%eax), %eax
-; X86-NEXT: andl %ebx, (%ecx)
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: notl %edx
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl %edx, 4(%ebx)
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %esi, 8(%ebx)
-; X86-NEXT: notl %edi
-; X86-NEXT: andl %edi, 12(%ebx)
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: btrl %edx, %ebx
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl %ebx, (%ecx,%esi)
; X86-NEXT: jae .LBB22_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB22_2:
-; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_multiload_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %esi
-; SSE-NEXT: xorl %r8d, %r8d
-; SSE-NEXT: shldq %cl, %rsi, %r8
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: shlq %cl, %rsi
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rsi, %r8
-; SSE-NEXT: cmovneq %rax, %rsi
-; SSE-NEXT: notq %r8
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movl %ecx, %r9d
-; SSE-NEXT: andl $96, %r9d
-; SSE-NEXT: shrl $3, %r9d
-; SSE-NEXT: movl (%rdi,%r9), %r9d
-; SSE-NEXT: btl %ecx, %r9d
-; SSE-NEXT: jb .LBB22_2
-; SSE-NEXT: # %bb.1:
-; SSE-NEXT: movl (%rdx), %eax
-; SSE-NEXT: .LBB22_2:
-; SSE-NEXT: andq %r8, 8(%rdi)
-; SSE-NEXT: andq %rsi, (%rdi)
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: reset_multiload_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %r8d
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %r8, %r8
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: cmovneq %rax, %r8
-; AVX2-NEXT: notq %rsi
-; AVX2-NEXT: notq %r8
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: andl $96, %r9d
-; AVX2-NEXT: shrl $3, %r9d
-; AVX2-NEXT: movl (%rdi,%r9), %r9d
-; AVX2-NEXT: btl %ecx, %r9d
-; AVX2-NEXT: jb .LBB22_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: movl (%rdx), %eax
-; AVX2-NEXT: .LBB22_2:
-; AVX2-NEXT: andq %rsi, 8(%rdi)
-; AVX2-NEXT: andq %r8, (%rdi)
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: reset_multiload_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %r8d
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %r8, %rsi
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: shlxq %rcx, %r8, %r8
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %r8, %rsi
-; AVX512-NEXT: cmovneq %rax, %r8
-; AVX512-NEXT: notq %rsi
-; AVX512-NEXT: notq %r8
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: andl $96, %r9d
-; AVX512-NEXT: shrl $3, %r9d
-; AVX512-NEXT: movl (%rdi,%r9), %r9d
-; AVX512-NEXT: btl %ecx, %r9d
-; AVX512-NEXT: jb .LBB22_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl (%rdx), %eax
-; AVX512-NEXT: .LBB22_2:
-; AVX512-NEXT: andq %rsi, 8(%rdi)
-; AVX512-NEXT: andq %r8, (%rdi)
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; X64-LABEL: reset_multiload_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %r9d
+; X64-NEXT: movl %r9d, %r8d
+; X64-NEXT: btrl %esi, %r8d
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: btl %esi, %r9d
+; X64-NEXT: jb .LBB22_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: .LBB22_2:
+; X64-NEXT: movl %r8d, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
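For reference, the chain query this patch now relies on, SDValue::reachesChainWithoutSideEffects (llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp), walks chains through TokenFactor nodes and unordered loads, since neither has side effects, up to a small depth limit. A simplified sketch of that logic (paraphrased; the in-tree implementation is authoritative):

bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
                                             unsigned Depth) const {
  if (*this == Dest)
    return true;
  // Don't search too deeply; we only want to see through TokenFactors etc.
  if (Depth == 0)
    return false;
  // All inputs of a TokenFactor happen in parallel: Dest is reachable if it
  // is a direct operand, or if every operand reaches it.
  if (getOpcode() == ISD::TokenFactor) {
    if (is_contained((*this)->ops(), Dest))
      return true;
    return llvm::all_of((*this)->ops(), [=](SDValue Op) {
      return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
    });
  }
  // Unordered loads have no side effects; look through their input chain.
  if (auto *Ld = dyn_cast<LoadSDNode>(*this))
    if (Ld->isUnordered())
      return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth - 1);
  return false;
}

In reset_multiload_i128 above, the store's chain is a TokenFactor of the i128 RMW load and the load of %p, so the query succeeds and the narrowing fires, producing the btrl/btl sequence in the updated CHECK lines.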
LGTM.