[X86] narrowBitOpRMW - add handling for single bit insertion patterns #165742
Conversation
Insertion of a single bit into a large integer is typically canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can be simplified to modify only the affected i32 block: a BTR followed by an OR with ((i32)InsertBit << (ShAmt % 32)). We must ensure that InsertBit is zero apart from its LSB so we can cheaply truncate it to work with the i32 block, like the simpler BT patterns.
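To make the intent concrete, here is a minimal scalar C++ sketch of the narrowed RMW (illustration only, not code from the patch; the helper name and signature are invented for this example). It shows that inserting one bit into a wide integer stored as 32-bit words only needs to read, modify and write the single word containing the bit:

#include <cassert>
#include <cstdint>

// Scalar model of the narrowed RMW bit insertion (hypothetical helper, for
// illustration of the DAG combine's effect only).
static void insertBitNarrow(uint32_t *Words, unsigned ShAmt, uint32_t InsertBit) {
  assert(InsertBit <= 1 && "InsertBit must be zero apart from the LSB");
  uint32_t *Word = &Words[ShAmt / 32]; // the i32 block touched by the RMW
  unsigned Modulo = ShAmt % 32;        // shift amount within that block
  uint32_t X = *Word;
  X &= ~(UINT32_C(1) << Modulo);       // BTR: clear the target bit
  X |= InsertBit << Modulo;            // OR in the (already truncated) insert bit
  *Word = X;
}

The assert mirrors the known-bits check added in the patch: the transform is only safe when InsertBit has no bits set other than its LSB, which is what makes the truncation to i32 free.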
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes
Insertion of a single bit into a large integer is typically canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can be simplified to modify only the affected i32 block: a BTR followed by an OR with ((i32)InsertBit << (ShAmt % 32)). We must ensure that InsertBit is zero apart from its LSB so we can cheaply truncate it to work with the i32 block, like the simpler BT patterns.
Patch is 43.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165742.diff
2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49beadae63f03..5ad41b2ddf06e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53502,7 +53502,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
}
// Look for a RMW operation that only touches one bit of a larger than legal
-// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
+// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
+// i32 sub value.
static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -53528,14 +53529,20 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
// BTR: X & ~(1 << ShAmt)
// BTS: X | (1 << ShAmt)
// BTC: X ^ (1 << ShAmt)
- SDValue ShAmt;
+ //
+ // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
+ SDValue InsertBit, ShAmt;
if (!StoredVal.hasOneUse() ||
!(sd_match(StoredVal, m_And(m_Specific(LoadVal),
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
sd_match(StoredVal,
m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
+ m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ sd_match(StoredVal,
+ m_Or(m_And(m_Specific(LoadVal),
+ m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+ m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
return SDValue();
// Ensure the shift amount is in bounds.
@@ -53543,6 +53550,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
return SDValue();
+ // If we're inserting a bit then it must be the LSB.
+ if (InsertBit) {
+ KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
+ if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
+ return SDValue();
+ }
+
// Split the shift into an alignment shift that moves the active i32 block to
// the bottom bits for truncation and a modulo shift that can act on the i32.
EVT AmtVT = ShAmt.getValueType();
@@ -53550,6 +53564,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
DAG.getSignedConstant(-32LL, DL, AmtVT));
SDValue ModuloAmt =
DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
+ ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);
// Compute the byte offset for the i32 block that is changed by the RMW.
// combineTruncate will adjust the load for us in a similar way.
@@ -53564,13 +53579,23 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
- SDValue Mask =
- DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
- DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
- if (StoredVal.getOpcode() == ISD::AND)
- Mask = DAG.getNOT(DL, Mask, MVT::i32);
+ SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getConstant(1, DL, MVT::i32), ModuloAmt);
+
+ SDValue Res;
+ if (InsertBit) {
+ SDValue BitMask =
+ DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
+ Res =
+ DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
+ } else {
+ if (StoredVal.getOpcode() == ISD::AND)
+ Mask = DAG.getNOT(DL, Mask, MVT::i32);
+ Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
+ }
- SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
Align(), St->getMemOperand()->getFlags());
}
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index cc3dcf32ac0eb..63047ef389ed9 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX
; bt/btc/btr/bts patterns + 'init' to set single bit value in large integers
@@ -356,41 +356,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB9_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: .LBB9_2:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: notl %esi
-; X86-NEXT: notl %edx
-; X86-NEXT: je .LBB9_4
-; X86-NEXT: # %bb.3:
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB9_4:
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: andl (%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $32, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl (%ebx,%eax), %eax
-; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
-; X86-NEXT: movl %esi, 4(%ebx)
-; X86-NEXT: movl %edx, (%ebx)
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -600,201 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $96, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movzbl 16(%ebp), %ebx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 72(%esp,%edi), %edx
-; X86-NEXT: movl 76(%esp,%edi), %esi
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 64(%esp,%edi), %ebx
-; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT: movl 68(%esp,%edi), %ebx
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: notl %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl 40(%esp,%eax), %edi
-; X86-NEXT: movl 44(%esp,%eax), %esi
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 12(%ecx), %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl 36(%esp,%esi), %esi
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 8(%edx), %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl 32(%esp,%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl 8(%ebp), %edi
-; X86-NEXT: andl 4(%edi), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edi), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl (%edi,%eax), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edi)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 8(%edi)
-; X86-NEXT: movl %ebx, 4(%edi)
-; X86-NEXT: movl %edx, (%edi)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %esi
-; SSE-NEXT: xorl %r8d, %r8d
-; SSE-NEXT: shldq %cl, %rsi, %r8
-; SSE-NEXT: shlq %cl, %rsi
-; SSE-NEXT: movl %edx, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: xorl %r9d, %r9d
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rsi, %r8
-; SSE-NEXT: cmovneq %r9, %rsi
-; SSE-NEXT: notq %r8
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: cmovneq %r9, %rax
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: andq 8(%rdi), %r8
-; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq (%rdi), %rsi
-; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: andl $96, %eax
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: movl (%rdi,%rax), %eax
-; SSE-NEXT: btl %ecx, %eax
+; SSE-NEXT: andl $96, %esi
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: movl (%rdi,%rsi), %r8d
+; SSE-NEXT: btl %ecx, %r8d
; SSE-NEXT: setae %al
-; SSE-NEXT: movq %r8, 8(%rdi)
-; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: shll %cl, %edx
+; SSE-NEXT: btrl %ecx, %r8d
+; SSE-NEXT: orl %r8d, %edx
+; SSE-NEXT: movl %edx, (%rdi,%rsi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: init_eq_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %rax, %rsi
-; AVX2-NEXT: movl %edx, %edx
-; AVX2-NEXT: xorl %r8d, %r8d
-; AVX2-NEXT: shldq %cl, %rdx, %r8
-; AVX2-NEXT: xorl %r9d, %r9d
-; AVX2-NEXT: shlxq %rcx, %rax, %rax
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rax, %rsi
-; AVX2-NEXT: cmovneq %r9, %rax
-; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX2-NEXT: cmovneq %rdx, %r8
-; AVX2-NEXT: cmovneq %r9, %rdx
-; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: andnq (%rdi), %rax, %r8
-; AVX2-NEXT: orq %rdx, %r8
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $96, %eax
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: movl (%rdi,%rax), %eax
-; AVX2-NEXT: btl %ecx, %eax
-; AVX2-NEXT: setae %al
-; AVX2-NEXT: movq %rsi, 8(%rdi)
-; AVX2-NEXT: movq %r8, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: init_eq_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %rax, %rsi
-; AVX512-NEXT: xorl %r8d, %r8d
-; AVX512-NEXT: shlxq %rcx, %rax, %rax
-; AVX512-NEXT: movl %edx, %edx
-; AVX512-NEXT: xorl %r9d, %r9d
-; AVX512-NEXT: shldq %cl, %rdx, %r9
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rax, %rsi
-; AVX512-NEXT: cmovneq %r8, %rax
-; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX512-NEXT: cmovneq %rdx, %r9
-; AVX512-NEXT: cmovneq %r8, %rdx
-; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi
-; AVX512-NEXT: orq %r9, %rsi
-; AVX512-NEXT: andnq (%rdi), %rax, %r8
-; AVX512-NEXT: orq %rdx, %r8
-; AVX512-NEXT: movl %ecx, %eax
-; AVX512-NEXT: andl $96, %eax
-; AVX512-NEXT: shrl $3, %eax
-; AVX512-NEXT: movl (%rdi,%rax), %eax
-; AVX512-NEXT: btl %ecx, %eax
-; AVX512-NEXT: setae %al
-; AVX512-NEXT: movq %rsi, 8(%rdi)
-; AVX512-NEXT: movq %r8, (%rdi)
-; AVX512-NEXT: retq
+; AVX-LABEL: init_eq_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: andl $96, %ecx
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: movl (%rdi,%rcx), %r8d
+; AVX-NEXT: btl %esi, %r8d
+; AVX-NEXT: setae %al
+; AVX-NEXT: btrl %esi, %r8d
+; AVX-NEXT: shlxl %esi, %edx, %edx
+; AVX-NEXT: orl %r8d, %edx
+; AVX-NEXT: movl %edx, (%rdi,%rcx)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -970,665 +803,55 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $352, %esp # imm = 0x160
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%eax), %esi
-; X86-NEXT: movl 60(%eax), %ebx
-; X86-NEXT: movl 52(%eax), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%eax), %edi
-; X86-NEXT: movl 44(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl 16(%ebp), %eax
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {...
[truncated]
LGTM.
These were removed in llvm#165742 but separate tests from llvm#165758 still need them
@RKSimon Reverted to unblock another revert.
…llvm#165742) Insertion of a single bit into a large integer is typically canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can be simplified to modify the i32 block as a BTR followed by an OR with ((i32)InsertBit << (ShAmt % 32)). We must ensure that InsertBit is zero apart from its LSB so we can cheaply truncate it to work with the i32 block, like the simpler BT patterns.
…65856) These were removed in llvm#165742 but separate tests from llvm#165758 still need them
…patterns (llvm#165742)" (llvm#165978) This reverts commit 2108c62. llvm#165742 blocks revert of llvm#165540.