[X86] Add custom type legalization for v16i64->v16i8 truncate and v8i64->v8i8 truncate when v8i64 isn't legal

Summary:
The default legalization for v16i64->v16i8 tries a multiple-stage truncate, concatenating after each stage and truncating again. But AVX512 implements truncates with multiple uops, so it should be better to truncate all the way down to the desired element size and then concatenate the pieces using unpckl instructions. This minimizes the number of 2-uop truncates; the unpcks are all single-uop instructions.
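Concretely, the input is just a wide trunc in IR. For reference, the v16i64 test from min-legal-vector-width.ll (its body mirrors the v8i64 variant visible in the test diff below, so take the exact body as illustrative):

define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" {
  %a = load <16 x i64>, <16 x i64>* %x
  %b = trunc <16 x i64> %a to <16 x i8>
  ret <16 x i8> %b
}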

I tried to handle this by just custom splitting the v16i64->v16i8 shuffle, hoping that the DAG combiner would leave the two halves in the state needed for D68374 to do the job on each half. That worked for the first half, but the second half got messed up. So I've implemented custom handling for v8i64->v8i8 when v8i64 needs to be split, producing the VTRUNCs directly.
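In condensed form, the new ReplaceNodeResults path for the split v8i64 case looks like the sketch below (the Subtarget.hasVLX(), type-action, and v4i64-legality guards are omitted; see the full diff for the exact code):

// Split the illegal v8i64 input into two legal v4i64 halves.
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(In, dl);

// Truncate each half all the way to i8 elements in one step; the four
// result bytes of each VTRUNC land in the low lanes of a v16i8.
Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);

// Indices 0-3 select Lo's bytes and 16-19 select Hi's (elements of the
// second shuffle operand are numbered from 16). The upper eight lanes
// stay undef because v8i8 is widened to v16i8; the shuffle lowers to a
// single vpunpckldq, as the updated CHECK lines show.
SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
                                   { 0, 1, 2, 3, 16, 17, 18, 19,
                                     -1, -1, -1, -1, -1, -1, -1, -1 });
Results.push_back(Res);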

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68428

llvm-svn: 373864
topperc committed Oct 6, 2019
1 parent 842dde6 commit 570ae49
Showing 5 changed files with 173 additions and 68 deletions.
26 changes: 23 additions & 3 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1763,6 +1763,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,

setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}

// We want to custom lower some of our intrinsics.
@@ -19329,9 +19330,11 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");

// If called by the legalizer just return.
if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) {
if ((InVT == MVT::v8i64 || InVT == MVT::v16i32) && VT.is128BitVector()) {
// If we're called by the type legalizer, handle a few cases.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(InVT)) {
if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
VT.is128BitVector()) {
assert(Subtarget.hasVLX() && "Unexpected subtarget!");
// The default behavior is to truncate one step, concatenate, and then
// truncate the remainder. We'd rather produce two 64-bit results and
@@ -27958,6 +27961,23 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
}
if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
isTypeLegal(MVT::v4i64)) {
// Input needs to be split and output needs to widened. Let's use two
// VTRUNCs, and shuffle their results together into the wider type.
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(In, dl);

Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
{ 0, 1, 2, 3, 16, 17, 18, 19,
-1, -1, -1, -1, -1, -1, -1, -1 });
Results.push_back(Res);
return;
}

return;
}
case ISD::ANY_EXTEND:
41 changes: 15 additions & 26 deletions llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -797,14 +797,12 @@ define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2
; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3
; CHECK-NEXT: vpmovqd %ymm2, %xmm2
; CHECK-NEXT: vpmovqd %ymm3, %xmm3
; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; CHECK-NEXT: vpmovdb %ymm2, %xmm2
; CHECK-NEXT: vpmovqd %ymm0, %xmm0
; CHECK-NEXT: vpmovqd %ymm1, %xmm1
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vpmovdb %ymm0, %xmm0
; CHECK-NEXT: vpmovqb %ymm3, %xmm3
; CHECK-NEXT: vpmovqb %ymm2, %xmm2
; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-NEXT: vpmovqb %ymm1, %xmm1
; CHECK-NEXT: vpmovqb %ymm0, %xmm0
; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -829,24 +827,15 @@ define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
}

define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-AVX512-LABEL: trunc_v8i64_v8i8:
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-AVX512-NEXT: vpmovqb %ymm1, %xmm1
; CHECK-AVX512-NEXT: vpmovqb %ymm0, %xmm0
; CHECK-AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: retq
;
; CHECK-VBMI-LABEL: trunc_v8i64_v8i8:
; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
; CHECK-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
; CHECK-LABEL: trunc_v8i64_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vpmovqb %ymm1, %xmm1
; CHECK-NEXT: vpmovqb %ymm0, %xmm0
; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%a = load <8 x i64>, <8 x i64>* %x
%b = trunc <8 x i64> %a to <8 x i8>
ret <8 x i8> %b
65 changes: 51 additions & 14 deletions llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -2732,20 +2732,57 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512F-LABEL: trunc_packus_v16i64_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpminsq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v16i64_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1
; AVX512VL-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v16i64_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpminsq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpminsq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v16i64_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%3 = icmp sgt <16 x i64> %2, zeroinitializer
59 changes: 45 additions & 14 deletions llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -2717,20 +2717,51 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_ssat_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127]
; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512F-LABEL: trunc_ssat_v16i64_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpminsq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_ssat_v16i64_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovsqb %zmm1, %xmm1
; AVX512VL-NEXT: vpmovsqb %zmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_ssat_v16i64_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127]
; AVX512BW-NEXT: vpminsq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpminsq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_ssat_v16i64_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsqb %zmm1, %xmm1
; AVX512BWVL-NEXT: vpmovsqb %zmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <16 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
%2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
%3 = icmp sgt <16 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
50 changes: 39 additions & 11 deletions llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -1842,17 +1842,45 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_usat_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
; AVX512-NEXT: vpminuq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpminuq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512F-LABEL: trunc_usat_v16i64_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminuq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_usat_v16i64_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1
; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_usat_v16i64_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpminuq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpminuq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_usat_v16i64_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1
; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp ult <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%3 = trunc <16 x i64> %2 to <16 x i8>
