From f6182e3b1b5e6549225512ed6a52a9ac4af1368d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 3 Sep 2025 14:00:11 +0100 Subject: [PATCH] [X86] Only fold AND/ANDNP back to VSELECT if we know the predicated mask select is legal By only checking type legality we didn't account for 128/256-bit ops being run on non-AVX512VL targets, or vXi8/i16 ops being run on non-AVX512BW targets This check is cropping up in several places now and I intend to hoist it out into a common helper, but this initial fix needs to be as clean as possible to be back ported to 21.X Fixes #156256 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++++ llvm/test/CodeGen/X86/pr156256.ll | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr156256.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 572cfdad3c93b..47cea933d0836 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -51856,6 +51856,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, SDValue X, Y; EVT CondVT = VT.changeVectorElementType(MVT::i1); if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) && + (VT.is512BitVector() || Subtarget.hasVLX()) && + (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && sd_match(N, m_And(m_Value(X), m_OneUse(m_SExt(m_AllOf( m_Value(Y), m_SpecificVT(CondVT), @@ -55420,6 +55422,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 && + (VT.is512BitVector() || Subtarget.hasVLX()) && + (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse()) return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1, getZeroVector(VT, Subtarget, DAG, DL)); diff --git a/llvm/test/CodeGen/X86/pr156256.ll b/llvm/test/CodeGen/X86/pr156256.ll new file mode 100644 index 0000000000000..13caa6fee5878 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr156256.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=AVX512VL + +define <16 x i16> @PR156256(<16 x i32> %a, <16 x i32> %b) { +; AVX512-LABEL: PR156256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; AVX512-NEXT: vpmovm2d %k0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: PR156256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: vpmovm2d %k0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-NEXT: retq + %icmp = icmp ugt <16 x i32> %a, %b + %sext = sext <16 x i1> %icmp to <16 x i16> + %and = and <16 x i16> %sext, splat (i16 16256) + ret <16 x i16> %and +}