Skip to content

Commit

Permalink
[AVX-512] Add support for creating SIGN_EXTEND_VECTOR_INREG and ZERO_…
Browse files Browse the repository at this point in the history
…EXTEND_VECTOR_INREG for 512-bit vectors to support vpmovzxbq and vpmovsxbq.

Summary: The one tricky thing about this is that the sign/zero_extend_inreg uses v64i8 as an input type which isn't legal without BWI support. Though the vpmovsxbq and vpmovzxbq instructions themselves don't require BWI. To support this we need to add custom lowering for ZERO_EXTEND_VECTOR_INREG with v64i8 input. This can mostly reuse the existing sign extend code with a couple checks for sign extend vs zero extend added.

Reviewers: delena, RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D25594

llvm-svn: 285053
  • Loading branch information
topperc committed Oct 25, 2016
1 parent 4f3b2df commit 01e4667
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 38 deletions.
50 changes: 39 additions & 11 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1309,6 +1309,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, VT, Legal);
}

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
Expand Down Expand Up @@ -1509,6 +1516,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
setOperationAction(ISD::UMIN, MVT::v32i16, Legal);

setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
if (Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
Expand Down Expand Up @@ -16368,9 +16377,13 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}

static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
Expand All @@ -16385,20 +16398,33 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
!(VT.is256BitVector() && Subtarget.hasInt256()))
!(VT.is256BitVector() && Subtarget.hasInt256()) &&
!(VT.is512BitVector() && Subtarget.hasAVX512()))
return SDValue();

SDLoc dl(Op);

// For 256-bit vectors, we only need the lower (128-bit) half of the input.
if (VT.is256BitVector())
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
In, DAG.getIntPtrConstant(0, dl));
// For 512-bit vectors, we need 128-bits or 256-bits.
if (VT.getSizeInBits() > 128) {
// Input needs to be at least the same number of elements as output, and
// at least 128-bits.
int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
}

assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");

// SSE41 targets can use the pmovsx* instructions directly.
unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
X86ISD::VSEXT : X86ISD::VZEXT;
if (Subtarget.hasSSE41())
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
return DAG.getNode(ExtOpc, dl, VT, In);

// We should only get here for sign extend.
assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
"Unexpected opcode!");

// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
SDValue Curr = In;
Expand Down Expand Up @@ -22075,8 +22101,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
Expand Down Expand Up @@ -31120,7 +31147,8 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
// ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
// Also use this if we don't have SSE41 to allow the legalizer do its job.
if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
(VT.is256BitVector() && Subtarget.hasInt256())) {
(VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.hasAVX512())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
return Opcode == ISD::SIGN_EXTEND
? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
Expand Down
24 changes: 6 additions & 18 deletions llvm/test/CodeGen/X86/avx512-pmovxrm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,12 @@ define <8 x i64> @test_llvm_x86_avx512_pmovsxbq(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovsxbq:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: vpsllq $56, %zmm0, %zmm0
; X32-NEXT: vpsraq $56, %zmm0, %zmm0
; X32-NEXT: vpmovsxbq (%eax), %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovsxbq:
; X64: ## BB#0:
; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vpsllq $56, %zmm0, %zmm0
; X64-NEXT: vpsraq $56, %zmm0, %zmm0
; X64-NEXT: vpmovsxbq (%rdi), %zmm0
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
Expand Down Expand Up @@ -139,22 +135,14 @@ define <8 x i64> @test_llvm_x86_avx512_pmovzxbq(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovzxbq:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; X32-NEXT: vpand %ymm2, %ymm1, %ymm1
; X32-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X32-NEXT: vmovdqu (%eax), %xmm0
; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovzxbq:
; X64: ## BB#0:
; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; X64-NEXT: vpand %ymm2, %ymm1, %ymm1
; X64-NEXT: vpand %ymm2, %ymm0, %ymm0
; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-NEXT: vmovdqu (%rdi), %xmm0
; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
Expand Down
4 changes: 1 addition & 3 deletions llvm/test/CodeGen/X86/vector-sext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -599,9 +599,7 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; AVX512-LABEL: sext_16i8_to_8i64:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpsllq $56, %zmm0, %zmm0
; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0
; AVX512-NEXT: vpmovsxbq %xmm0, %zmm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i8_to_8i64:
Expand Down
7 changes: 1 addition & 6 deletions llvm/test/CodeGen/X86/vector-zext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -461,17 +461,12 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; AVX512F-LABEL: zext_16i8_to_8i64:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: zext_16i8_to_8i64:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
Expand Down

0 comments on commit 01e4667

Please sign in to comment.