Skip to content

Commit

Permalink
[X86][SSE] matchShuffleWithPACK - avoid poison pollution from bitcast…
Browse files Browse the repository at this point in the history
…ing multiple elements together.

D106053 exposed that we've not been taking into account that by bitcasting smaller elements together and then performing a ComputeKnownBits on the result we'd be allowing a poison element to influence other neighbouring elements being used in the pack. Instead we now peek through any existing bitcast to ensure that the source type already matches the source width of the pack node we're trying to match.

This has also been a chance to stop matchShuffleWithPACK from creating unused nodes on the fly, which could affect one-use tests during shuffle lowering/combining.

The only regression we're seeing is due to being unable to peek through a bitcast as it's on the other side of an extract_subvector - which should go away once we finally allow shuffle combining across different vector widths (by making matchShuffleWithPACK use a const SelectionDAG& we've gotten closer to this - see PR45974).
  • Loading branch information
RKSimon committed Jul 18, 2021
1 parent 367ec77 commit 51a12d2
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 12 deletions.
33 changes: 22 additions & 11 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -11796,7 +11796,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
unsigned &PackOpcode, ArrayRef<int> TargetMask,
SelectionDAG &DAG,
const SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned MaxStages = 1) {
unsigned NumElts = VT.getVectorNumElements();
Expand All @@ -11807,23 +11807,34 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
unsigned NumSrcBits = PackVT.getScalarSizeInBits();
unsigned NumPackedBits = NumSrcBits - BitSize;
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
N1 = peekThroughBitcasts(N1);
N2 = peekThroughBitcasts(N2);
unsigned NumBits1 = N1.getScalarValueSizeInBits();
unsigned NumBits2 = N2.getScalarValueSizeInBits();
bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
(!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
return false;
if (Subtarget.hasSSE41() || BitSize == 8) {
APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
(N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
V1 = VV1;
V2 = VV2;
if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
(N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
V1 = N1;
V2 = N2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKUS;
return true;
}
}
if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
(N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
V1 = VV1;
V2 = VV2;
bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
(N2.isUndef() || IsZero2 || IsAllOnes2 ||
DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
V1 = N1;
V2 = N2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKSS;
return true;
Expand Down
12 changes: 11 additions & 1 deletion llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
Expand Up @@ -754,8 +754,18 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,6,7],zmm1[2,3,6,7]
; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14>
; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[0,1,4,5]
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %ymm5, %ymm1, %ymm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512BW-NEXT: vpshufb %ymm5, %ymm0, %ymm0
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
Expand Down

0 comments on commit 51a12d2

Please sign in to comment.