
Commit c4ac31d

[X86] narrowBitOpRMW - use reachesChainWithoutSideEffects instead of direct chain matching (#165870)
This allows RMW load/store chains to be matched through TokenFactor nodes when additional loads appear in the chain between the load and the store.
1 parent: 5b2f9b5
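
As an illustration of the case this unlocks, here is a minimal IR sketch (hypothetical function and value names, not taken from the test suite). The unrelated load from %p gives the i128 store a TokenFactor chain instead of the RMW load itself, which the old direct-chain check rejected:

; Hypothetical example: the extra load joins the chain, so the store's
; chain operand is a TokenFactor rather than the i128 load directly.
define i32 @btr_with_extra_load(ptr %word, i32 %position, ptr %p) nounwind {
  %ofs = zext i32 %position to i128
  %bit = shl nuw i128 1, %ofs
  %mask = xor i128 %bit, -1   ; ~(1 << %position)
  %ld = load i128, ptr %word  ; RMW load
  %other = load i32, ptr %p   ; additional load in the chain
  %res = and i128 %ld, %mask  ; BTR pattern: X & ~(1 << ShAmt)
  store i128 %res, ptr %word  ; RMW store back to the same address
  ret i32 %other
}

With this change the combine can walk from the store's chain through the TokenFactor back to the load's output chain, so the narrowing still fires.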

2 files changed: +46 -146 lines
llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 19 additions & 21 deletions
@@ -53355,21 +53355,11 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
                               SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
   using namespace SDPatternMatch;
-
-  // Only handle normal stores and its chain was a matching normal load.
-  auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
-  if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
-      !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
-      Ld->getBasePtr() != St->getBasePtr() ||
-      Ld->getOffset() != St->getOffset())
-    return SDValue();
-
-  SDValue LoadVal(Ld, 0);
   SDValue StoredVal = St->getValue();
   EVT VT = StoredVal.getValueType();
 
-  // Only narrow larger than legal scalar integers.
-  if (!VT.isScalarInteger() ||
+  // Only narrow normal stores of larger than legal scalar integers.
+  if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() ||
       VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
     return SDValue();
 
@@ -53378,18 +53368,26 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
   // BTC: X ^ (1 << ShAmt)
   //
   // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
-  SDValue InsertBit, ShAmt;
+  SDValue SrcVal, InsertBit, ShAmt;
   if (!StoredVal.hasOneUse() ||
-      !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
+      !(sd_match(StoredVal, m_And(m_Value(SrcVal),
                                   m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
         sd_match(StoredVal,
-                 m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+                 m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
         sd_match(StoredVal,
-                 m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
-        sd_match(StoredVal,
-                 m_Or(m_And(m_Specific(LoadVal),
-                            m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
-                      m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+                 m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+        sd_match(
+            StoredVal,
+            m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+                 m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+    return SDValue();
+
+  // SrcVal must be a matching normal load further up the chain.
+  auto *Ld = dyn_cast<LoadSDNode>(SrcVal);
+  if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+      Ld->getBasePtr() != St->getBasePtr() ||
+      Ld->getOffset() != St->getOffset() ||
+      !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)))
     return SDValue();
 
   // Ensure the shift amount is in bounds.
@@ -53423,7 +53421,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
                           SDNodeFlags::NoUnsignedWrap);
 
   // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
-  SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
+  SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt);
   X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
 
   SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
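
The functional change is the final condition: rather than requiring St->getChain() to be the load node itself, the combine now asks whether the store's chain reaches the load's output chain without crossing a side-effecting node, which lets it look through TokenFactors. For reference, a hedged IR sketch of the BitInsert form the matcher recognizes (names invented for illustration):

; Hypothetical single-bit insert into an i128 held in memory:
; (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
define void @bitinsert_i128(ptr %word, i32 %position, i1 %value) nounwind {
  %ofs = zext i32 %position to i128
  %bit = shl nuw i128 1, %ofs
  %notbit = xor i128 %bit, -1    ; ~(1 << ShAmt)
  %ins = zext i1 %value to i128
  %insbit = shl i128 %ins, %ofs  ; InsertBit << ShAmt
  %ld = load i128, ptr %word     ; X
  %clr = and i128 %ld, %notbit   ; X & ~(1 << ShAmt)
  %res = or i128 %clr, %insbit   ; | (InsertBit << ShAmt)
  store i128 %res, ptr %word
  ret void
}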

llvm/test/CodeGen/X86/bittest-big-integer.ll

Lines changed: 27 additions & 125 deletions
@@ -1029,144 +1029,46 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
 define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
 ; X86-LABEL: reset_multiload_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $12, %al
-; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 40(%esp,%eax), %edx
-; X86-NEXT:    movl 44(%esp,%eax), %esi
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    movl 32(%esp,%eax), %edi
-; X86-NEXT:    movl 36(%esp,%eax), %ebx
-; X86-NEXT:    shldl %cl, %ebx, %edx
-; X86-NEXT:    shldl %cl, %edi, %ebx
-; X86-NEXT:    notl %ebx
-; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    andl %ebx, 4(%eax)
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    notl %edi
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    andl $96, %ebx
-; X86-NEXT:    shrl $3, %ebx
-; X86-NEXT:    movl (%eax,%ebx), %ebx
-; X86-NEXT:    andl %edi, (%eax)
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl %esi, 12(%eax)
-; X86-NEXT:    notl %edx
-; X86-NEXT:    andl %edx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    btl %ecx, %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $96, %esi
+; X86-NEXT:    shrl $3, %esi
+; X86-NEXT:    movl (%ecx,%esi), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    btrl %edx, %ebx
+; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl %ebx, (%ecx,%esi)
 ; X86-NEXT:    jae .LBB22_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB22_2:
-; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: reset_multiload_i128:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    movl $1, %esi
-; SSE-NEXT:    xorl %r8d, %r8d
-; SSE-NEXT:    shldq %cl, %rsi, %r8
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    shlq %cl, %rsi
-; SSE-NEXT:    testb $64, %cl
-; SSE-NEXT:    cmovneq %rsi, %r8
-; SSE-NEXT:    cmovneq %rax, %rsi
-; SSE-NEXT:    notq %r8
-; SSE-NEXT:    notq %rsi
-; SSE-NEXT:    movl %ecx, %r9d
-; SSE-NEXT:    andl $96, %r9d
-; SSE-NEXT:    shrl $3, %r9d
-; SSE-NEXT:    movl (%rdi,%r9), %r9d
-; SSE-NEXT:    btl %ecx, %r9d
-; SSE-NEXT:    jb .LBB22_2
-; SSE-NEXT:  # %bb.1:
-; SSE-NEXT:    movl (%rdx), %eax
-; SSE-NEXT:  .LBB22_2:
-; SSE-NEXT:    andq %rsi, (%rdi)
-; SSE-NEXT:    andq %r8, 8(%rdi)
-; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
-; SSE-NEXT:    retq
-;
-; AVX2-LABEL: reset_multiload_i128:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %esi, %ecx
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    movl $1, %r8d
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    shldq %cl, %r8, %rsi
-; AVX2-NEXT:    shlxq %rcx, %r8, %r8
-; AVX2-NEXT:    testb $64, %cl
-; AVX2-NEXT:    cmovneq %r8, %rsi
-; AVX2-NEXT:    cmovneq %rax, %r8
-; AVX2-NEXT:    notq %rsi
-; AVX2-NEXT:    notq %r8
-; AVX2-NEXT:    movl %ecx, %r9d
-; AVX2-NEXT:    andl $96, %r9d
-; AVX2-NEXT:    shrl $3, %r9d
-; AVX2-NEXT:    movl (%rdi,%r9), %r9d
-; AVX2-NEXT:    btl %ecx, %r9d
-; AVX2-NEXT:    jb .LBB22_2
-; AVX2-NEXT:  # %bb.1:
-; AVX2-NEXT:    movl (%rdx), %eax
-; AVX2-NEXT:  .LBB22_2:
-; AVX2-NEXT:    andq %r8, (%rdi)
-; AVX2-NEXT:    andq %rsi, 8(%rdi)
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: reset_multiload_i128:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movl %esi, %ecx
-; AVX512-NEXT:    movl $1, %r8d
-; AVX512-NEXT:    xorl %esi, %esi
-; AVX512-NEXT:    shldq %cl, %r8, %rsi
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    shlxq %rcx, %r8, %r8
-; AVX512-NEXT:    testb $64, %cl
-; AVX512-NEXT:    cmovneq %r8, %rsi
-; AVX512-NEXT:    cmovneq %rax, %r8
-; AVX512-NEXT:    notq %rsi
-; AVX512-NEXT:    notq %r8
-; AVX512-NEXT:    movl %ecx, %r9d
-; AVX512-NEXT:    andl $96, %r9d
-; AVX512-NEXT:    shrl $3, %r9d
-; AVX512-NEXT:    movl (%rdi,%r9), %r9d
-; AVX512-NEXT:    btl %ecx, %r9d
-; AVX512-NEXT:    jb .LBB22_2
-; AVX512-NEXT:  # %bb.1:
-; AVX512-NEXT:    movl (%rdx), %eax
-; AVX512-NEXT:  .LBB22_2:
-; AVX512-NEXT:    andq %r8, (%rdi)
-; AVX512-NEXT:    andq %rsi, 8(%rdi)
-; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT:    retq
+; X64-LABEL: reset_multiload_i128:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    andl $96, %ecx
+; X64-NEXT:    shrl $3, %ecx
+; X64-NEXT:    movl (%rdi,%rcx), %r9d
+; X64-NEXT:    movl %r9d, %r8d
+; X64-NEXT:    btrl %esi, %r8d
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    btl %esi, %r9d
+; X64-NEXT:    jb .LBB22_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    movl (%rdx), %eax
+; X64-NEXT:  .LBB22_2:
+; X64-NEXT:    movl %r8d, (%rdi,%rcx)
+; X64-NEXT:    retq
   %rem = and i32 %position, 127
   %ofs = zext nneg i32 %rem to i128
   %bit = shl nuw i128 1, %ofs
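
The updated checks show the payoff: the i128 bit reset now touches only the 32-bit block containing the bit. position & 96 isolates the dword-aligned bit offset (for %position = 70: 70 & 96 = 64, i.e. byte offset 8 after the shift by 3), and btrl/btl then operate on that single word. A rough IR equivalent of the narrowed RMW, as an illustrative sketch rather than actual compiler output:

; Sketch of the narrowed form of the bit reset: load, clear and store
; only the addressed 32-bit word.
define void @narrowed_reset_sketch(ptr %word, i32 %position) nounwind {
  %bitofs = and i32 %position, 96        ; dword-aligned bit offset (i128: bits 5-6)
  %byteofs = lshr i32 %bitofs, 3         ; bits -> bytes
  %blk = getelementptr i8, ptr %word, i32 %byteofs
  %x = load i32, ptr %blk                ; the 32-bit block containing the bit
  %rem = and i32 %position, 31
  %bit = shl i32 1, %rem
  %mask = xor i32 %bit, -1
  %clr = and i32 %x, %mask               ; BTR on the narrow word
  store i32 %clr, ptr %blk
  ret void
}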
