Skip to content

Commit

Permalink
[X86] Fold BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP…
Browse files Browse the repository at this point in the history
…(Z,W)) (REAPPLIED)

Fold allsignbits pack patterns to make better use of cheap (and commutable) logic ops

Reapplied after a32d14f / 156913c with bitcast fix
  • Loading branch information
RKSimon committed Jul 6, 2023
1 parent bb65e5b commit 3f7470c
Show file tree
Hide file tree
Showing 8 changed files with 394 additions and 333 deletions.
52 changes: 52 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50172,6 +50172,49 @@ static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}

// Try to sink a bitwise logic op (AND/OR/XOR) through a pair of signed packs:
//   BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
//
// The rewrite is only attempted when every pack input is known to be
// "all sign bits" (each element is either all-zeros or all-ones), and when
// both logic operands have a single use so the original packs become dead.
// The packs may be hidden behind one-use bitcasts; the result is bitcast back
// to the type of N.
//
// Returns the replacement value, or an empty SDValue if the pattern does not
// match.
// TODO: Handle PACKUS.
static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
  const unsigned LogicOpc = N->getOpcode();
  assert((LogicOpc == ISD::OR || LogicOpc == ISD::AND ||
          LogicOpc == ISD::XOR) &&
         "Unexpected bit opcode");

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Bail out unless each logic operand is consumed solely by N.
  if (!(Op0.hasOneUse() && Op1.hasOneUse()))
    return SDValue();

  // Look past any single-use bitcasts to find the underlying packs.
  SDValue Pack0 = peekThroughOneUseBitcasts(Op0);
  SDValue Pack1 = peekThroughOneUseBitcasts(Op1);
  if (Pack0.getOpcode() != X86ISD::PACKSS)
    return SDValue();
  if (Pack1.getOpcode() != X86ISD::PACKSS)
    return SDValue();

  // Both packs have to produce the same vector type.
  MVT PackVT = Pack0.getSimpleValueType();
  if (PackVT != Pack1.getSimpleValueType())
    return SDValue();

  // Name the four pack inputs after the pattern in the header comment.
  SDValue X = Pack0.getOperand(0);
  SDValue Z = Pack0.getOperand(1);
  SDValue Y = Pack1.getOperand(0);
  SDValue W = Pack1.getOperand(1);

  // The logic ops are rebuilt in the (wider) pre-pack element type.
  // NOTE(review): only X's type is inspected; Y/Z/W are presumed to share it
  // because both packs yield PackVT - confirm against PACKSS's type rules.
  MVT WideVT = X.getSimpleValueType();
  unsigned WideEltBits = WideVT.getScalarSizeInBits();

  // Limit to allsignbits packing: every bit of each element must be a copy of
  // its sign bit.
  auto IsAllSignBits = [&](SDValue V) {
    return DAG.ComputeNumSignBits(V) == WideEltBits;
  };
  if (!IsAllSignBits(X) || !IsAllSignBits(Z) || !IsAllSignBits(Y) ||
      !IsAllSignBits(W))
    return SDValue();

  // Apply the logic op to the matching pack inputs, then pack once.
  SDLoc DL(N);
  SDValue LoLogic = DAG.getNode(LogicOpc, DL, WideVT, X, Y);
  SDValue HiLogic = DAG.getNode(LogicOpc, DL, WideVT, Z, W);
  SDValue Packed = DAG.getNode(X86ISD::PACKSS, DL, PackVT, LoLogic, HiLogic);

  // N's own type may differ from PackVT if bitcasts were peeked through.
  EVT ResultVT = N->getValueType(0);
  return DAG.getBitcast(ResultVT, Packed);
}

/// If this is a zero/all-bits result that is bitwise-anded with a low bits
/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
Expand Down Expand Up @@ -50596,6 +50639,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineBitOpWithShift(N, DAG))
return R;

if (SDValue R = combineBitOpWithPACK(N, DAG))
return R;

if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
return FPLogic;

Expand Down Expand Up @@ -51353,6 +51399,9 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineBitOpWithShift(N, DAG))
return R;

if (SDValue R = combineBitOpWithPACK(N, DAG))
return R;

if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
return FPLogic;

Expand Down Expand Up @@ -53889,6 +53938,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineBitOpWithShift(N, DAG))
return R;

if (SDValue R = combineBitOpWithPACK(N, DAG))
return R;

if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
return FPLogic;

Expand Down
70 changes: 35 additions & 35 deletions llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,13 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; SSE2-SSSE3-LABEL: v16i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtw %xmm2, %xmm0
; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtw %xmm7, %xmm5
; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtw %xmm6, %xmm4
; SSE2-SSSE3-NEXT: packsswb %xmm5, %xmm4
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4
; SSE2-SSSE3-NEXT: pcmpgtw %xmm7, %xmm5
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5
; SSE2-SSSE3-NEXT: packsswb %xmm5, %xmm4
; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax
; SSE2-SSSE3-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-SSSE3-NEXT: retq
Expand Down Expand Up @@ -221,13 +221,13 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
define i8 @v8i32_and(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSE2-SSSE3-LABEL: v8i32_and:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm4
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4
; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5
; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm4
; SSE2-SSSE3-NEXT: packsswb %xmm4, %xmm4
; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
Expand Down Expand Up @@ -290,13 +290,13 @@ define i8 @v8i32_and(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
define i8 @v8i32_or(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSE2-SSSE3-LABEL: v8i32_or:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm4
; SSE2-SSSE3-NEXT: por %xmm0, %xmm4
; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
; SSE2-SSSE3-NEXT: por %xmm1, %xmm5
; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm4
; SSE2-SSSE3-NEXT: packsswb %xmm4, %xmm4
; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
Expand Down Expand Up @@ -365,17 +365,17 @@ define i8 @v8i32_or_and(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d,
; SSE2-SSSE3-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-SSSE3-NEXT: por %xmm1, %xmm7
; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm6
; SSE2-SSSE3-NEXT: por %xmm0, %xmm6
; SSE2-SSSE3-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm9
; SSE2-SSSE3-NEXT: pand %xmm6, %xmm9
; SSE2-SSSE3-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm8
; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm8
; SSE2-SSSE3-NEXT: pand %xmm6, %xmm8
; SSE2-SSSE3-NEXT: packsswb %xmm8, %xmm8
; SSE2-SSSE3-NEXT: pmovmskb %xmm8, %eax
; SSE2-SSSE3-NEXT: pand %xmm7, %xmm8
; SSE2-SSSE3-NEXT: packssdw %xmm8, %xmm9
; SSE2-SSSE3-NEXT: packsswb %xmm9, %xmm9
; SSE2-SSSE3-NEXT: pmovmskb %xmm9, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
Expand Down Expand Up @@ -630,13 +630,13 @@ define i8 @v8i32_or_select(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32
define i8 @v8f32_and(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
; SSE2-SSSE3-LABEL: v8f32_and:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: cmpltps %xmm1, %xmm3
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm2
; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSE2-SSSE3-NEXT: cmpltps %xmm5, %xmm7
; SSE2-SSSE3-NEXT: cmpltps %xmm1, %xmm3
; SSE2-SSSE3-NEXT: cmpltps %xmm4, %xmm6
; SSE2-SSSE3-NEXT: andps %xmm2, %xmm6
; SSE2-SSSE3-NEXT: cmpltps %xmm5, %xmm7
; SSE2-SSSE3-NEXT: andps %xmm3, %xmm7
; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm6
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6
; SSE2-SSSE3-NEXT: packsswb %xmm6, %xmm6
; SSE2-SSSE3-NEXT: pmovmskb %xmm6, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
Expand Down Expand Up @@ -681,13 +681,13 @@ define i8 @v8f32_and(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float>
define i8 @v8f32_xor(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
; SSE2-SSSE3-LABEL: v8f32_xor:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: cmpltps %xmm1, %xmm3
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm2
; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSE2-SSSE3-NEXT: cmpltps %xmm5, %xmm7
; SSE2-SSSE3-NEXT: cmpltps %xmm1, %xmm3
; SSE2-SSSE3-NEXT: cmpltps %xmm4, %xmm6
; SSE2-SSSE3-NEXT: xorps %xmm2, %xmm6
; SSE2-SSSE3-NEXT: cmpltps %xmm5, %xmm7
; SSE2-SSSE3-NEXT: xorps %xmm3, %xmm7
; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm6
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm6
; SSE2-SSSE3-NEXT: packsswb %xmm6, %xmm6
; SSE2-SSSE3-NEXT: pmovmskb %xmm6, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
Expand Down Expand Up @@ -734,25 +734,25 @@ define i8 @v8f32_xor(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float>
define i8 @v8f32_xor_and(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, <8 x float> %e, <8 x float> %f) {
; SSE2-SSSE3-LABEL: v8f32_xor_and:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
; SSE2-SSSE3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
; SSE2-SSSE3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
; SSE2-SSSE3-NEXT: cmpnleps %xmm3, %xmm1
; SSE2-SSSE3-NEXT: cmpnleps %xmm2, %xmm0
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movaps %xmm5, %xmm1
; SSE2-SSSE3-NEXT: cmpeqps %xmm7, %xmm1
; SSE2-SSSE3-NEXT: movaps %xmm5, %xmm2
; SSE2-SSSE3-NEXT: cmpeqps %xmm7, %xmm2
; SSE2-SSSE3-NEXT: cmpunordps %xmm7, %xmm5
; SSE2-SSSE3-NEXT: orps %xmm1, %xmm5
; SSE2-SSSE3-NEXT: orps %xmm2, %xmm5
; SSE2-SSSE3-NEXT: xorps %xmm1, %xmm5
; SSE2-SSSE3-NEXT: movaps %xmm4, %xmm1
; SSE2-SSSE3-NEXT: cmpeqps %xmm6, %xmm1
; SSE2-SSSE3-NEXT: cmpunordps %xmm6, %xmm4
; SSE2-SSSE3-NEXT: orps %xmm1, %xmm4
; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm4
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm4
; SSE2-SSSE3-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm9
; SSE2-SSSE3-NEXT: xorps %xmm0, %xmm4
; SSE2-SSSE3-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm8
; SSE2-SSSE3-NEXT: andps %xmm4, %xmm8
; SSE2-SSSE3-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm9
; SSE2-SSSE3-NEXT: andps %xmm5, %xmm9
; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm8
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm8
; SSE2-SSSE3-NEXT: packsswb %xmm8, %xmm8
; SSE2-SSSE3-NEXT: pmovmskb %xmm8, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
Expand Down
Loading

0 comments on commit 3f7470c

Please sign in to comment.