[X86] Freeze shl(x,1) -> add(x,x) vector fold (PR50468)
Fold vector shl(x,1) -> add(freeze(x),freeze(x)) to avoid the undef issues identified in PR50468

Differential Revision: https://reviews.llvm.org/D106675
RKSimon committed Aug 15, 2022
1 parent f7f5308 commit a7b85e4
Showing 16 changed files with 472 additions and 501 deletions.
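
For background (not part of this commit), the hazard the freeze guards against can be sketched in a few lines of LLVM IR; the function name below is illustrative. A shift left by one always produces an even result (the low bit of every lane is zero), even when the source is undef, whereas an add of two separate uses of an undef value need not be even, because each use may resolve to a different value. Freezing the operand before reusing it restores the guarantee:

define <2 x i64> @shl_by_one_sketch(<2 x i64> %x) {
  ; Unsafe rewrite: (add %x, %x) for undef %x may be odd, unlike (shl %x, 1).
  ; Freeze pins %x to a single (arbitrary but fixed) value, so adding it to
  ; itself doubles that value and the result is guaranteed even, matching the
  ; semantics of the original shift.
  %fx = freeze <2 x i64> %x
  %r = add <2 x i64> %fx, %fx
  ret <2 x i64> %r
}

This is the same reasoning spelled out in the comment added to LowerShiftByScalarImmediate below.
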
30 changes: 15 additions & 15 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29735,8 +29735,22 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,

uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();

if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
// Hardware support for vector shifts is sparse which makes us scalarize the
// vector operations in many cases. Also, on sandybridge ADD is faster than
// shl: (shl V, 1) -> (add (freeze V), (freeze V))
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
// R may be undef at run-time, but (shl R, 1) must be an even number (LSB
// must be 0). (add undef, undef) however can be any value. To make this
// safe, we must freeze R to ensure that register allocation uses the same
// register for an undefined value. This ensures that the result will
// still be even and preserves the original semantics.
R = DAG.getFreeze(R);
return DAG.getNode(ISD::ADD, dl, VT, R, R);
}

return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
}

// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
@@ -46674,20 +46688,6 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
}
}

// Hardware support for vector shifts is sparse which makes us scalarize the
// vector operations in many cases. Also, on sandybridge ADD is faster than
// shl.
// (shl V, 1) -> add V,V
if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
assert(N0.getValueType().isVector() && "Invalid vector shift type");
// We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
if (N1SplatC->isOne())
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}

return SDValue();
}

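
The test updates that follow all reflect the same codegen change: a splat vector shift left by one now selects paddq %xmm0, %xmm0 on SSE targets (and the corresponding vpadd forms on AVX) instead of psllq $1, %xmm0. A rough sketch of the kind of IR involved (illustrative, not one of the updated tests):

define <2 x i64> @shl_one_example(<2 x i64> %x) {
  ; With this change, the splat shl-by-1 is lowered as an add of the frozen
  ; value with itself rather than as an immediate shift.
  %r = shl <2 x i64> %x, <i64 1, i64 1>
  ret <2 x i64> %r
}

In rotate_vec.ll, performing the rewrite during lowering instead of as an early DAG combine appears to let other folds fire first, so the add drops out of the output entirely.
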
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/combine-mul.ll
@@ -80,13 +80,13 @@ define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllq $1, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllq $4, %xmm2
; SSE-NEXT: psllq $2, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: paddq %xmm0, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2c:
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/freeze-binary.ll
@@ -398,7 +398,7 @@ define <2 x i64> @freeze_shl_vec(<2 x i64> %a0) nounwind {
define <2 x i64> @freeze_shl_vec_outofrange(<2 x i64> %a0) nounwind {
; X86-LABEL: freeze_shl_vec_outofrange:
; X86: # %bb.0:
; X86-NEXT: psllq $1, %xmm0
; X86-NEXT: paddq %xmm0, %xmm0
; X86-NEXT: psllq $2, %xmm0
; X86-NEXT: retl
;
56 changes: 28 additions & 28 deletions llvm/test/CodeGen/X86/oddsubvector.ll
@@ -157,71 +157,71 @@ define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2: # %bb.0:
; SSE2-NEXT: movl b(%rip), %eax
; SSE2-NEXT: movdqa c+144(%rip), %xmm0
; SSE2-NEXT: movdqa c+128(%rip), %xmm1
; SSE2-NEXT: movdqa c+128(%rip), %xmm0
; SSE2-NEXT: movdqa c+144(%rip), %xmm1
; SSE2-NEXT: addl c+128(%rip), %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: movdqa d+144(%rip), %xmm4
; SSE2-NEXT: psubd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: psubd %xmm1, %xmm4
; SSE2-NEXT: paddd %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT: movdqa %xmm0, c+144(%rip)
; SSE2-NEXT: movdqa %xmm1, c+144(%rip)
; SSE2-NEXT: movaps %xmm5, c+128(%rip)
; SSE2-NEXT: movdqa c+160(%rip), %xmm0
; SSE2-NEXT: movdqa c+160(%rip), %xmm1
; SSE2-NEXT: movdqa c+176(%rip), %xmm3
; SSE2-NEXT: movdqa d+160(%rip), %xmm5
; SSE2-NEXT: movdqa d+176(%rip), %xmm6
; SSE2-NEXT: movdqa d+128(%rip), %xmm7
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE2-NEXT: psubd %xmm1, %xmm7
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT: psubd %xmm0, %xmm7
; SSE2-NEXT: psubd %xmm3, %xmm6
; SSE2-NEXT: psubd %xmm0, %xmm5
; SSE2-NEXT: psubd %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm5, d+160(%rip)
; SSE2-NEXT: movdqa %xmm6, d+176(%rip)
; SSE2-NEXT: movdqa %xmm4, d+144(%rip)
; SSE2-NEXT: movdqa %xmm7, d+128(%rip)
; SSE2-NEXT: paddd %xmm3, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, c+160(%rip)
; SSE2-NEXT: paddd %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, c+160(%rip)
; SSE2-NEXT: movdqa %xmm3, c+176(%rip)
; SSE2-NEXT: retq
;
; SSE42-LABEL: PR42833:
; SSE42: # %bb.0:
; SSE42-NEXT: movl b(%rip), %eax
; SSE42-NEXT: movdqa c+144(%rip), %xmm0
; SSE42-NEXT: movdqa c+128(%rip), %xmm1
; SSE42-NEXT: movdqa c+128(%rip), %xmm0
; SSE42-NEXT: movdqa c+144(%rip), %xmm1
; SSE42-NEXT: addl c+128(%rip), %eax
; SSE42-NEXT: movd %eax, %xmm2
; SSE42-NEXT: paddd %xmm1, %xmm2
; SSE42-NEXT: paddd %xmm0, %xmm2
; SSE42-NEXT: movdqa d+144(%rip), %xmm3
; SSE42-NEXT: psubd %xmm0, %xmm3
; SSE42-NEXT: paddd %xmm0, %xmm0
; SSE42-NEXT: movdqa %xmm1, %xmm4
; SSE42-NEXT: paddd %xmm1, %xmm4
; SSE42-NEXT: psubd %xmm1, %xmm3
; SSE42-NEXT: paddd %xmm1, %xmm1
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: paddd %xmm0, %xmm4
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT: movdqa %xmm0, c+144(%rip)
; SSE42-NEXT: movdqa %xmm1, c+144(%rip)
; SSE42-NEXT: movdqa %xmm4, c+128(%rip)
; SSE42-NEXT: movdqa c+160(%rip), %xmm0
; SSE42-NEXT: movdqa c+160(%rip), %xmm1
; SSE42-NEXT: movdqa c+176(%rip), %xmm2
; SSE42-NEXT: movdqa d+160(%rip), %xmm4
; SSE42-NEXT: movdqa d+176(%rip), %xmm5
; SSE42-NEXT: movdqa d+128(%rip), %xmm6
; SSE42-NEXT: pinsrd $0, %eax, %xmm1
; SSE42-NEXT: psubd %xmm1, %xmm6
; SSE42-NEXT: pinsrd $0, %eax, %xmm0
; SSE42-NEXT: psubd %xmm0, %xmm6
; SSE42-NEXT: psubd %xmm2, %xmm5
; SSE42-NEXT: psubd %xmm0, %xmm4
; SSE42-NEXT: psubd %xmm1, %xmm4
; SSE42-NEXT: movdqa %xmm4, d+160(%rip)
; SSE42-NEXT: movdqa %xmm5, d+176(%rip)
; SSE42-NEXT: movdqa %xmm3, d+144(%rip)
; SSE42-NEXT: movdqa %xmm6, d+128(%rip)
; SSE42-NEXT: paddd %xmm2, %xmm2
; SSE42-NEXT: paddd %xmm0, %xmm0
; SSE42-NEXT: movdqa %xmm0, c+160(%rip)
; SSE42-NEXT: paddd %xmm1, %xmm1
; SSE42-NEXT: movdqa %xmm1, c+160(%rip)
; SSE42-NEXT: movdqa %xmm2, c+176(%rip)
; SSE42-NEXT: retq
;
6 changes: 0 additions & 6 deletions llvm/test/CodeGen/X86/rotate_vec.ll
@@ -111,21 +111,18 @@ define <4 x i32> @rot_v4i32_mask_ashr0(<4 x i32> %a0) {
; XOPAVX1-LABEL: rot_v4i32_mask_ashr0:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: rot_v4i32_mask_ashr0:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: rot_v4i32_mask_ashr0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = ashr <4 x i32> %a0, <i32 25, i32 26, i32 27, i32 28>
@@ -139,23 +136,20 @@ define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) {
; XOPAVX1-LABEL: rot_v4i32_mask_ashr1:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: rot_v4i32_mask_ashr1:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $25, %xmm0, %xmm0
; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: rot_v4i32_mask_ashr1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $25, %xmm0, %xmm0
; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq