diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
new file mode 100644
index 0000000000000..b40305e2f179c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -0,0 +1,381 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+gfni | FileCheck %s --check-prefixes=GFNISSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX512,GFNIAVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX512,GFNIAVX512BW
+
+;
+; 128 Bit Vector Funnel Shifts
+;
+
+define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; GFNISSE-LABEL: splatconstant_fshl_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $5, %xmm1
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: psllw $3, %xmm0
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT: por %xmm1, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_fshl_v16i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vpsrlw $5, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsllw $3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_fshl_v16i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $5, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpsllw $3, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: splatconstant_fshl_v16i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm2
+; GFNIAVX512-NEXT: vpsrlw $5, %xmm1, %xmm0
+; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; GFNIAVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; GFNIAVX512-NEXT: vzeroupper
+; GFNIAVX512-NEXT: retq
+ %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; GFNISSE-LABEL: splatconstant_fshr_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $7, %xmm1
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: por %xmm1, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX-LABEL: splatconstant_fshr_v16i8:
+; GFNIAVX: # %bb.0:
+; GFNIAVX-NEXT: vpsrlw $7, %xmm1, %xmm1
+; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX-NEXT: retq
+ %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
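+; A worked sketch of the lowering checked above (hand-written annotation,
+; not autogenerated): with a splat amount of 3, fshl on i8 lanes computes
+;   fshl(a, b, 3) = (a << 3) | (b >> 5)
+; X86 has no vector i8 shifts, so psllw/psrlw shift 16-bit lanes and the
+; pand of a splat mask (0xF8 for the left half, 0x07 for the right) clears
+; the bits that crossed byte boundaries. For example, with a = 0x81 and
+; b = 0xE0 in one lane:
+;   ((0x81 << 3) & 0xF8) | ((0xE0 >> 5) & 0x07) = 0x08 | 0x07 = 0x0F
+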
+;
+; 256 Bit Vector Funnel Shifts
+;
+
+define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
+; GFNISSE-LABEL: splatconstant_fshl_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $4, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNISSE-NEXT: movdqa %xmm4, %xmm5
+; GFNISSE-NEXT: pandn %xmm2, %xmm5
+; GFNISSE-NEXT: psllw $4, %xmm0
+; GFNISSE-NEXT: pand %xmm4, %xmm0
+; GFNISSE-NEXT: por %xmm5, %xmm0
+; GFNISSE-NEXT: psrlw $4, %xmm3
+; GFNISSE-NEXT: psllw $4, %xmm1
+; GFNISSE-NEXT: pand %xmm4, %xmm1
+; GFNISSE-NEXT: pandn %xmm3, %xmm4
+; GFNISSE-NEXT: por %xmm4, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_fshl_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT: vpsllw $4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_fshl_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsllw $4, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: splatconstant_fshl_v32i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vpsllw $4, %ymm0, %ymm2
+; GFNIAVX512-NEXT: vpsrlw $4, %ymm1, %ymm0
+; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; GFNIAVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; GFNIAVX512-NEXT: retq
+ %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
+
+define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
+; GFNISSE-LABEL: splatconstant_fshr_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $6, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNISSE-NEXT: movdqa %xmm4, %xmm5
+; GFNISSE-NEXT: pandn %xmm2, %xmm5
+; GFNISSE-NEXT: psllw $2, %xmm0
+; GFNISSE-NEXT: pand %xmm4, %xmm0
+; GFNISSE-NEXT: por %xmm5, %xmm0
+; GFNISSE-NEXT: psrlw $6, %xmm3
+; GFNISSE-NEXT: psllw $2, %xmm1
+; GFNISSE-NEXT: pand %xmm4, %xmm1
+; GFNISSE-NEXT: pandn %xmm3, %xmm4
+; GFNISSE-NEXT: por %xmm4, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_fshr_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $6, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT: vpsllw $2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_fshr_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $6, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsllw $2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: splatconstant_fshr_v32i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vpsllw $2, %ymm0, %ymm2
+; GFNIAVX512-NEXT: vpsrlw $6, %ymm1, %ymm0
+; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; GFNIAVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; GFNIAVX512-NEXT: retq
+ %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
+
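+; Two details worth noting in the 256-bit checks above (annotation, not
+; autogenerated): AVX1 has no 256-bit integer ops, so each ymm input is
+; split into xmm halves, shifted, masked, and rejoined with vinsertf128
+; and a vorps. The SSE lowering instead keeps one mask register live:
+; the two byte masks of a funnel-shift pair are bitwise complements
+; (0xFC and 0x03 for the shl-by-2/srl-by-6 pair here), so pand applies
+; one mask and pandn applies the other from the same constant.
+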
+;
+; 512 Bit Vector Funnel Shifts
+;
+
+define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
+; GFNISSE-LABEL: splatconstant_fshl_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm0, %xmm4
+; GFNISSE-NEXT: psrlw $7, %xmm4
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNISSE-NEXT: pand %xmm5, %xmm4
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: por %xmm4, %xmm0
+; GFNISSE-NEXT: movdqa %xmm1, %xmm4
+; GFNISSE-NEXT: psrlw $7, %xmm4
+; GFNISSE-NEXT: pand %xmm5, %xmm4
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
+; GFNISSE-NEXT: por %xmm4, %xmm1
+; GFNISSE-NEXT: movdqa %xmm2, %xmm4
+; GFNISSE-NEXT: psrlw $7, %xmm4
+; GFNISSE-NEXT: pand %xmm5, %xmm4
+; GFNISSE-NEXT: paddb %xmm2, %xmm2
+; GFNISSE-NEXT: por %xmm4, %xmm2
+; GFNISSE-NEXT: movdqa %xmm3, %xmm4
+; GFNISSE-NEXT: psrlw $7, %xmm4
+; GFNISSE-NEXT: pand %xmm5, %xmm4
+; GFNISSE-NEXT: paddb %xmm3, %xmm3
+; GFNISSE-NEXT: por %xmm4, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_fshl_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm1, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_fshl_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
+; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $7, %ymm1, %ymm2
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512F-LABEL: splatconstant_fshl_v64i8:
+; GFNIAVX512F: # %bb.0:
+; GFNIAVX512F-NEXT: vpsrlw $7, %ymm0, %ymm1
+; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; GFNIAVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; GFNIAVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; GFNIAVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; GFNIAVX512F-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: splatconstant_fshl_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm1
+; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <64 x i8> %res
+}
+declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
+
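+; A shift-left-by-one needs no masking at all: paddb computes a + a, which
+; is a << 1 per byte with no bits crossing lanes, so only the psrlw $7
+; half is masked (to 0x01) before the or. Per byte the identity checked
+; here is (sketch): fshl(x, y, 1) = (x << 1) | (y >> 7), with both
+; operands tied to %a in this test.
+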
+define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
+; GFNISSE-LABEL: splatconstant_fshr_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $2, %xmm4
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNISSE-NEXT: movdqa %xmm8, %xmm9
+; GFNISSE-NEXT: pandn %xmm4, %xmm9
+; GFNISSE-NEXT: psllw $6, %xmm0
+; GFNISSE-NEXT: pand %xmm8, %xmm0
+; GFNISSE-NEXT: por %xmm9, %xmm0
+; GFNISSE-NEXT: psrlw $2, %xmm5
+; GFNISSE-NEXT: movdqa %xmm8, %xmm4
+; GFNISSE-NEXT: pandn %xmm5, %xmm4
+; GFNISSE-NEXT: psllw $6, %xmm1
+; GFNISSE-NEXT: pand %xmm8, %xmm1
+; GFNISSE-NEXT: por %xmm4, %xmm1
+; GFNISSE-NEXT: psrlw $2, %xmm6
+; GFNISSE-NEXT: movdqa %xmm8, %xmm4
+; GFNISSE-NEXT: pandn %xmm6, %xmm4
+; GFNISSE-NEXT: psllw $6, %xmm2
+; GFNISSE-NEXT: pand %xmm8, %xmm2
+; GFNISSE-NEXT: por %xmm4, %xmm2
+; GFNISSE-NEXT: psrlw $2, %xmm7
+; GFNISSE-NEXT: psllw $6, %xmm3
+; GFNISSE-NEXT: pand %xmm8, %xmm3
+; GFNISSE-NEXT: pandn %xmm7, %xmm8
+; GFNISSE-NEXT: por %xmm8, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_fshr_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; GFNIAVX1-NEXT: vpsllw $6, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsllw $6, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; GFNIAVX1-NEXT: vpsllw $6, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpsllw $6, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; GFNIAVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_fshr_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $2, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $2, %ymm3, %ymm2
+; GFNIAVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; GFNIAVX2-NEXT: vpsllw $6, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512F-LABEL: splatconstant_fshr_v64i8:
+; GFNIAVX512F: # %bb.0:
+; GFNIAVX512F-NEXT: vpsllw $6, %ymm0, %ymm2
+; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; GFNIAVX512F-NEXT: vpsllw $6, %ymm0, %ymm0
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2
+; GFNIAVX512F-NEXT: vpsrlw $2, %ymm1, %ymm0
+; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; GFNIAVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; GFNIAVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; GFNIAVX512F-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: splatconstant_fshr_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpsllw $6, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm0
+; GFNIAVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>)
+ ret <64 x i8> %res
+}
+declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
new file mode 100644
index 0000000000000..7724d9a6e4273
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -0,0 +1,383 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+gfni | FileCheck %s --check-prefixes=GFNISSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX512,GFNIAVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX512,GFNIAVX512BW
+
+;
+; 128 Bit Vector Rotates
+;
+
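+; Rotates reuse the funnel-shift lowering with both operands tied to the
+; same value (sketch of the identity, not autogenerated):
+;   rotl(a, c) == fshl(a, a, c)    rotr(a, c) == fshr(a, a, c)
+; so every test below calls @llvm.fshl/@llvm.fshr with %a passed twice,
+; and the expected code mirrors gfni-funnel-shifts.ll.
+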
+define <16 x i8> @splatconstant_rotl_v16i8(<16 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_rotl_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm0, %xmm1
+; GFNISSE-NEXT: psrlw $5, %xmm1
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: psllw $3, %xmm0
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT: por %xmm1, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_rotl_v16i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vpsrlw $5, %xmm0, %xmm1
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsllw $3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_rotl_v16i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $5, %xmm0, %xmm1
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpsllw $3, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: splatconstant_rotl_v16i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm1
+; GFNIAVX512-NEXT: vpsrlw $5, %xmm0, %xmm0
+; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; GFNIAVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; GFNIAVX512-NEXT: vzeroupper
+; GFNIAVX512-NEXT: retq
+ %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+define <16 x i8> @splatconstant_rotr_v16i8(<16 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_rotr_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm0, %xmm1
+; GFNISSE-NEXT: psrlw $7, %xmm1
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: por %xmm1, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX-LABEL: splatconstant_rotr_v16i8:
+; GFNIAVX: # %bb.0:
+; GFNIAVX-NEXT: vpsrlw $7, %xmm0, %xmm1
+; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX-NEXT: retq
+ %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
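+; Worked byte example for the rotl-by-3 above (hand-written annotation):
+;   rotl(0x81, 3) = ((0x81 << 3) & 0xF8) | ((0x81 >> 5) & 0x07)
+;                 = 0x08 | 0x04 = 0x0C
+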
+;
+; 256 Bit Vector Rotates
+;
+
+define <32 x i8> @splatconstant_rotl_v32i8(<32 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_rotl_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE-NEXT: psrlw $4, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNISSE-NEXT: movdqa %xmm3, %xmm4
+; GFNISSE-NEXT: pandn %xmm2, %xmm4
+; GFNISSE-NEXT: psllw $4, %xmm0
+; GFNISSE-NEXT: pand %xmm3, %xmm0
+; GFNISSE-NEXT: por %xmm4, %xmm0
+; GFNISSE-NEXT: movdqa %xmm1, %xmm2
+; GFNISSE-NEXT: psrlw $4, %xmm2
+; GFNISSE-NEXT: psllw $4, %xmm1
+; GFNISSE-NEXT: pand %xmm3, %xmm1
+; GFNISSE-NEXT: pandn %xmm2, %xmm3
+; GFNISSE-NEXT: por %xmm3, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_rotl_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; GFNIAVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_rotl_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsllw $4, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: splatconstant_rotl_v32i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vpsllw $4, %ymm0, %ymm1
+; GFNIAVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; GFNIAVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; GFNIAVX512-NEXT: retq
+ %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
+
+define <32 x i8> @splatconstant_rotr_v32i8(<32 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_rotr_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE-NEXT: psrlw $6, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNISSE-NEXT: movdqa %xmm3, %xmm4
+; GFNISSE-NEXT: pandn %xmm2, %xmm4
+; GFNISSE-NEXT: psllw $2, %xmm0
+; GFNISSE-NEXT: pand %xmm3, %xmm0
+; GFNISSE-NEXT: por %xmm4, %xmm0
+; GFNISSE-NEXT: movdqa %xmm1, %xmm2
+; GFNISSE-NEXT: psrlw $6, %xmm2
+; GFNISSE-NEXT: psllw $2, %xmm1
+; GFNISSE-NEXT: pand %xmm3, %xmm1
+; GFNISSE-NEXT: pandn %xmm2, %xmm3
+; GFNISSE-NEXT: por %xmm3, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_rotr_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $6, %xmm1, %xmm2
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; GFNIAVX1-NEXT: vpsllw $2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $6, %xmm0, %xmm2
+; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_rotr_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $6, %ymm0, %ymm1
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsllw $2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: splatconstant_rotr_v32i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vpsllw $2, %ymm0, %ymm1
+; GFNIAVX512-NEXT: vpsrlw $6, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; GFNIAVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; GFNIAVX512-NEXT: retq
+ %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
+
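+; The AVX512 checks fold the and/andn/or into a single vpternlogq. With
+; the operand order (A = %zmm0, B = %zmm1, C = the broadcast constant),
+; immediate 216 (0xD8) encodes f(A,B,C) = (C & B) | (~C & A): a bitwise
+; select that keeps the left-shift bits where the splat mask is set and
+; the right-shift bits elsewhere. (Derived from the ternlog truth table;
+; annotation, not autogenerated.)
+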
+;
+; 512 Bit Vector Rotates
+;
+
+define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_rotl_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm0, %xmm4
+; GFNISSE-NEXT: psrlw $7, %xmm4
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNISSE-NEXT: pand %xmm5, %xmm4
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: por %xmm4, %xmm0
+; GFNISSE-NEXT: movdqa %xmm1, %xmm4
+; GFNISSE-NEXT: psrlw $7, %xmm4
+; GFNISSE-NEXT: pand %xmm5, %xmm4
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
+; GFNISSE-NEXT: por %xmm4, %xmm1
+; GFNISSE-NEXT: movdqa %xmm2, %xmm4
+; GFNISSE-NEXT: psrlw $7, %xmm4
+; GFNISSE-NEXT: pand %xmm5, %xmm4
+; GFNISSE-NEXT: paddb %xmm2, %xmm2
+; GFNISSE-NEXT: por %xmm4, %xmm2
+; GFNISSE-NEXT: movdqa %xmm3, %xmm4
+; GFNISSE-NEXT: psrlw $7, %xmm4
+; GFNISSE-NEXT: pand %xmm5, %xmm4
+; GFNISSE-NEXT: paddb %xmm3, %xmm3
+; GFNISSE-NEXT: por %xmm4, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_rotl_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm1, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_rotl_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
+; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $7, %ymm1, %ymm2
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512F-LABEL: splatconstant_rotl_v64i8:
+; GFNIAVX512F: # %bb.0:
+; GFNIAVX512F-NEXT: vpsrlw $7, %ymm0, %ymm1
+; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; GFNIAVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; GFNIAVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; GFNIAVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; GFNIAVX512F-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: splatconstant_rotl_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm1
+; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <64 x i8> %res
+}
+declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
+
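+; For the rotate-by-1 pattern the vpternlogq immediate is 248 (0xF8),
+; which encodes f(A,B,C) = A | (B & C): the unmasked a << 1 from vpaddb
+; or'd with the a >> 7 result masked to 0x01, fusing pand and por into
+; one instruction. (Derived from the ternlog truth table; annotation,
+; not autogenerated.)
+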
+define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_rotr_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm0, %xmm5
+; GFNISSE-NEXT: psrlw $2, %xmm5
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNISSE-NEXT: movdqa %xmm4, %xmm6
+; GFNISSE-NEXT: pandn %xmm5, %xmm6
+; GFNISSE-NEXT: psllw $6, %xmm0
+; GFNISSE-NEXT: pand %xmm4, %xmm0
+; GFNISSE-NEXT: por %xmm6, %xmm0
+; GFNISSE-NEXT: movdqa %xmm1, %xmm5
+; GFNISSE-NEXT: psrlw $2, %xmm5
+; GFNISSE-NEXT: movdqa %xmm4, %xmm6
+; GFNISSE-NEXT: pandn %xmm5, %xmm6
+; GFNISSE-NEXT: psllw $6, %xmm1
+; GFNISSE-NEXT: pand %xmm4, %xmm1
+; GFNISSE-NEXT: por %xmm6, %xmm1
+; GFNISSE-NEXT: movdqa %xmm2, %xmm5
+; GFNISSE-NEXT: psrlw $2, %xmm5
+; GFNISSE-NEXT: movdqa %xmm4, %xmm6
+; GFNISSE-NEXT: pandn %xmm5, %xmm6
+; GFNISSE-NEXT: psllw $6, %xmm2
+; GFNISSE-NEXT: pand %xmm4, %xmm2
+; GFNISSE-NEXT: por %xmm6, %xmm2
+; GFNISSE-NEXT: movdqa %xmm3, %xmm5
+; GFNISSE-NEXT: psrlw $2, %xmm5
+; GFNISSE-NEXT: psllw $6, %xmm3
+; GFNISSE-NEXT: pand %xmm4, %xmm3
+; GFNISSE-NEXT: pandn %xmm5, %xmm4
+; GFNISSE-NEXT: por %xmm4, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_rotr_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT: vpsllw $6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
+; GFNIAVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT: vpsllw $6, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
+; GFNIAVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT: vpsllw $6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm1, %xmm3
+; GFNIAVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT: vpsllw $6, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_rotr_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
+; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $2, %ymm1, %ymm2
+; GFNIAVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; GFNIAVX2-NEXT: vpsllw $6, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512F-LABEL: splatconstant_rotr_v64i8:
+; GFNIAVX512F: # %bb.0:
+; GFNIAVX512F-NEXT: vpsllw $6, %ymm0, %ymm1
+; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; GFNIAVX512F-NEXT: vpsllw $6, %ymm2, %ymm3
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; GFNIAVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; GFNIAVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; GFNIAVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; GFNIAVX512F-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: splatconstant_rotr_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpsllw $6, %zmm0, %zmm1
+; GFNIAVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>)
+ ret <64 x i8> %res
+}
+declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
new file mode 100644
index 0000000000000..479631c4e9bf8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -0,0 +1,402 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+gfni | FileCheck %s --check-prefixes=GFNISSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX512,GFNIAVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX512,GFNIAVX512BW
+
+;
+; 128 Bit Vector Shifts
+;
+
+define <16 x i8> @splatconstant_shl_v16i8(<16 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_shl_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psllw $3, %xmm0
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX-LABEL: splatconstant_shl_v16i8:
+; GFNIAVX: # %bb.0:
+; GFNIAVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX-NEXT: retq
+ %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %shift
+}
+
+define <16 x i8> @splatconstant_lshr_v16i8(<16 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_lshr_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $7, %xmm0
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX-LABEL: splatconstant_lshr_v16i8:
+; GFNIAVX: # %bb.0:
+; GFNIAVX-NEXT: vpsrlw $7, %xmm0, %xmm0
+; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX-NEXT: retq
+ %shift = lshr <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %shift
+}
+
+define <16 x i8> @splatconstant_ashr_v16i8(<16 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_ashr_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $4, %xmm0
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; GFNISSE-NEXT: pxor %xmm1, %xmm0
+; GFNISSE-NEXT: psubb %xmm1, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX-LABEL: splatconstant_ashr_v16i8:
+; GFNIAVX: # %bb.0:
+; GFNIAVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; GFNIAVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; GFNIAVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; GFNIAVX-NEXT: retq
+ %shift = ashr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+ ret <16 x i8> %shift
+}
+
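+; Arithmetic shifts are emulated with a logical shift plus a sign fix-up
+; (sketch of the identity): for a shift by 4,
+;   ashr(x, 4) == ((x >>u 4) ^ 0x08) - 0x08
+; where 0x08 is the sign bit after the shift; xor-then-subtract sign
+; extends it. E.g. x = 0x80: lshr gives 0x08, xor gives 0x00, and the
+; subtract gives 0xF8, i.e. -8 as expected.
+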
+;
+; 256 Bit Vector Shifts
+;
+
+define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_shl_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psllw $6, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNISSE-NEXT: pand %xmm2, %xmm0
+; GFNISSE-NEXT: psllw $6, %xmm1
+; GFNISSE-NEXT: pand %xmm2, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_shl_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; GFNIAVX1-NEXT: vpsllw $6, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsllw $6, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_shl_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: splatconstant_shl_v32i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vpsllw $6, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512-NEXT: retq
+ %shift = shl <32 x i8> %a, <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>
+ ret <32 x i8> %shift
+}
+
+define <32 x i8> @splatconstant_lshr_v32i8(<32 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_lshr_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $1, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNISSE-NEXT: pand %xmm2, %xmm0
+; GFNISSE-NEXT: psrlw $1, %xmm1
+; GFNISSE-NEXT: pand %xmm2, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_lshr_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_lshr_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: splatconstant_lshr_v32i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512-NEXT: retq
+ %shift = lshr <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <32 x i8> %shift
+}
+
+define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_ashr_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $2, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNISSE-NEXT: pand %xmm2, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; GFNISSE-NEXT: pxor %xmm3, %xmm0
+; GFNISSE-NEXT: psubb %xmm3, %xmm0
+; GFNISSE-NEXT: psrlw $2, %xmm1
+; GFNISSE-NEXT: pand %xmm2, %xmm1
+; GFNISSE-NEXT: pxor %xmm3, %xmm1
+; GFNISSE-NEXT: psubb %xmm3, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_ashr_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; GFNIAVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_ashr_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; GFNIAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: splatconstant_ashr_v32i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vpsrlw $2, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; GFNIAVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT: retq
+ %shift = ashr <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+ ret <32 x i8> %shift
+}
+
+;
+; 512 Bit Vector Shifts
+;
+
+define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_shl_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; GFNISSE-NEXT: pand %xmm4, %xmm0
+; GFNISSE-NEXT: psllw $5, %xmm1
+; GFNISSE-NEXT: pand %xmm4, %xmm1
+; GFNISSE-NEXT: psllw $5, %xmm2
+; GFNISSE-NEXT: pand %xmm4, %xmm2
+; GFNISSE-NEXT: psllw $5, %xmm3
+; GFNISSE-NEXT: pand %xmm4, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_shl_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsllw $5, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_shl_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsllw $5, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512F-LABEL: splatconstant_shl_v64i8:
+; GFNIAVX512F: # %bb.0:
+; GFNIAVX512F-NEXT: vpsllw $5, %ymm0, %ymm1
+; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; GFNIAVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; GFNIAVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512F-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: splatconstant_shl_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpsllw $5, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: retq
+ %shift = shl <64 x i8> %a, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
+ ret <64 x i8> %shift
+}
+
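+; With only AVX512F (no AVX512BW) there are no 512-bit vpsllw/vpsrlw
+; forms, so the zmm value is split into ymm halves, shifted with the
+; AVX2 encodings, and reassembled with vinserti64x4, while the byte mask
+; is still applied once at 512 bits via vpandq.
+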
+define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_lshr_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $7, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNISSE-NEXT: pand %xmm4, %xmm0
+; GFNISSE-NEXT: psrlw $7, %xmm1
+; GFNISSE-NEXT: pand %xmm4, %xmm1
+; GFNISSE-NEXT: psrlw $7, %xmm2
+; GFNISSE-NEXT: pand %xmm4, %xmm2
+; GFNISSE-NEXT: psrlw $7, %xmm3
+; GFNISSE-NEXT: pand %xmm4, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_lshr_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_lshr_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512F-LABEL: splatconstant_lshr_v64i8:
+; GFNIAVX512F: # %bb.0:
+; GFNIAVX512F-NEXT: vpsrlw $7, %ymm0, %ymm1
+; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; GFNIAVX512F-NEXT: vpsrlw $7, %ymm0, %ymm0
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; GFNIAVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512F-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: splatconstant_lshr_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: retq
+ %shift = lshr <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %shift
+}
+
+define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind {
+; GFNISSE-LABEL: splatconstant_ashr_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: psrlw $1, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNISSE-NEXT: pand %xmm4, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; GFNISSE-NEXT: pxor %xmm5, %xmm0
+; GFNISSE-NEXT: psubb %xmm5, %xmm0
+; GFNISSE-NEXT: psrlw $1, %xmm1
+; GFNISSE-NEXT: pand %xmm4, %xmm1
+; GFNISSE-NEXT: pxor %xmm5, %xmm1
+; GFNISSE-NEXT: psubb %xmm5, %xmm1
+; GFNISSE-NEXT: psrlw $1, %xmm2
+; GFNISSE-NEXT: pand %xmm4, %xmm2
+; GFNISSE-NEXT: pxor %xmm5, %xmm2
+; GFNISSE-NEXT: psubb %xmm5, %xmm2
+; GFNISSE-NEXT: psrlw $1, %xmm3
+; GFNISSE-NEXT: pand %xmm4, %xmm3
+; GFNISSE-NEXT: pxor %xmm5, %xmm3
+; GFNISSE-NEXT: psubb %xmm5, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatconstant_ashr_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; GFNIAVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsubb %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_ashr_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; GFNIAVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512F-LABEL: splatconstant_ashr_v64i8:
+; GFNIAVX512F: # %bb.0:
+; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; GFNIAVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; GFNIAVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; GFNIAVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; GFNIAVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; GFNIAVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; GFNIAVX512F-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; GFNIAVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
+; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; GFNIAVX512F-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: splatconstant_ashr_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; GFNIAVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; GFNIAVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: retq
+ %shift = ashr <64 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <64 x i8> %shift
+}
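+; The AVX512BW ashr above also fuses the mask and the sign-bit xor:
+; with A = %zmm0 (the unmasked x >>u 1), B = %zmm1 (the 64 bias) and
+; C = the 127 mask, vpternlogq immediate 108 (0x6C) encodes
+; f(A,B,C) = B ^ (A & C), leaving only the final vpsubb to complete
+; (((x >>u 1) & 127) ^ 64) - 64. (Derived from the ternlog truth table;
+; annotation, not autogenerated.)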