[X86] Freeze vXi8 shl(x,1) -> add(x,x) vector fold (PR50468)
We don't have any vXi8 shift instructions (other than on XOP, which is handled separately), so replace the shl(x,1) -> add(x,x) fold with shl(x,1) -> add(freeze(x),freeze(x)) to avoid the undef issues identified in PR50468.

Split off from D106675 as I'm still looking at whether we can fix the vXi16/i32/i64 issues with the D106679 alternative.

Differential Revision: https://reviews.llvm.org/D108139
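
A minimal sketch of the issue (illustrative only, not the PR50468 reproducer): every lane of shl(x,1) must be even, i.e. have its low bit clear, even when x is undef. Lowering it to add(x,x) lets the two operand uses of an undefined x be materialised as different values, so a lane could come out odd; freezing x first pins both operands to the same (arbitrary but fixed) value.

; Sketch only, not the PR50468 test case: a v16i8 shift-left-by-one.
; Each lane of %r must have its low bit clear, even when %x is undef.
define <16 x i8> @shl_by_one(<16 x i8> %x) {
  %r = shl <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %r
}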
RKSimon committed Aug 24, 2021
1 parent a643bd3 commit 307890f
Showing 14 changed files with 476 additions and 469 deletions.
9 changes: 8 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28731,8 +28731,15 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

// Simple i8 add case
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
// R may be undef at run-time, but (shl R, 1) must be an even number (LSB
// must be 0). (add undef, undef) however can be any value. To make this
// safe, we must freeze R to ensure that register allocation uses the same
// register for an undefined value. This ensures that the result will
// still be even and preserves the original semantics.
R = DAG.getNode(ISD::FREEZE, dl, VT, R);
return DAG.getNode(ISD::ADD, dl, VT, R, R);
}

// ashr(R, 7) === cmp_slt(R, 0)
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
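At the IR level, the new lowering is conceptually equivalent to freezing the operand once and adding the frozen value to itself, so the single paddb always sees two copies of the same value and the result stays even. This is an illustrative sketch; the actual transform operates on SelectionDAG nodes, as shown in the hunk above.

; Conceptual IR equivalent of the new DAG lowering (sketch, not compiler output).
define <16 x i8> @shl_by_one_lowered(<16 x i8> %x) {
  %fx = freeze <16 x i8> %x   ; pin an undef/poison %x to one fixed value
  %r = add <16 x i8> %fx, %fx ; both operands identical, so every lane is even
  ret <16 x i8> %r
}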
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/bitreverse.ll
@@ -69,11 +69,11 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: psrlw $2, %xmm0
; X64-NEXT: por %xmm1, %xmm0
; X64-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; X64-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; X64-NEXT: pand %xmm0, %xmm1
; X64-NEXT: paddb %xmm1, %xmm1
; X64-NEXT: psrlw $1, %xmm1
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: psrlw $1, %xmm0
; X64-NEXT: paddb %xmm0, %xmm0
; X64-NEXT: por %xmm1, %xmm0
; X64-NEXT: retq
;
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/combine-bitreverse.ll
@@ -61,11 +61,11 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind {
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: psrlw $2, %xmm0
; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; X86-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; X86-NEXT: pand %xmm0, %xmm1
; X86-NEXT: paddb %xmm1, %xmm1
; X86-NEXT: psrlw $1, %xmm1
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: psrlw $1, %xmm0
; X86-NEXT: paddb %xmm0, %xmm0
; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: retl
152 changes: 76 additions & 76 deletions llvm/test/CodeGen/X86/vector-bitreverse.ll

Large diffs are not rendered by default.

26 changes: 13 additions & 13 deletions llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -517,10 +517,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: psrlw $7, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrlw $7, %xmm4
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT: paddb %xmm2, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
@@ -553,10 +553,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: psrlw $7, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psrlw $7, %xmm3
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: paddb %xmm1, %xmm3
; SSE41-NEXT: por %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -580,10 +580,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -728,10 +728,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; X86-SSE2-NEXT: pandn %xmm3, %xmm2
; X86-SSE2-NEXT: por %xmm4, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
; X86-SSE2-NEXT: paddb %xmm2, %xmm3
; X86-SSE2-NEXT: psrlw $7, %xmm3
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
; X86-SSE2-NEXT: psrlw $7, %xmm4
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
; X86-SSE2-NEXT: paddb %xmm2, %xmm4
; X86-SSE2-NEXT: por %xmm3, %xmm4
; X86-SSE2-NEXT: paddb %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -443,10 +443,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -467,10 +467,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
192 changes: 96 additions & 96 deletions llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -272,154 +272,154 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm4
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vpmovb2m %zmm2, %k2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512BW-NEXT: vpsrlw $2, %zmm2, %zmm5
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm5
; AVX512BW-NEXT: vpmovb2m %zmm3, %k2
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubb %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm3
; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm4
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k2
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512VLBW-NEXT: vpsrlw $2, %zmm2, %zmm5
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpsrlw $1, %zmm2, %zmm5
; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm4, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k1
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k2
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512VBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5
; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm4, %zmm1
; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3
; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k1
; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1
; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k2
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm4, %zmm1
; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k1
; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
ret <64 x i8> %res
