diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll new file mode 100644 index 0000000000000..1fac1a0490954 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -0,0 +1,29220 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-SLOW,FALLBACK0 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-FAST,FALLBACK1 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-SLOW,FALLBACK2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-FAST,FALLBACK3 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-SLOW,FALLBACK4 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-FAST,FALLBACK5 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-SLOW,FALLBACK6 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-FAST,FALLBACK7 + +define void @mask_replication_factor2_vf2(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor2_vf2: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512F-ONLY-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512F-ONLY-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, %eax +; AVX512F-ONLY-NEXT: movb %al, (%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor2_vf2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512DQ-NEXT: vpmovd2m %xmm0, %k0 +; AVX512DQ-NEXT: kmovb %k0, (%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor2_vf2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: movb %al, (%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <2 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <4 x i32> + store <4 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor2_vf4(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-SLOW-LABEL: mask_replication_factor2_vf4: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 +; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512F-SLOW-NEXT: kmovw %k0, %eax +; AVX512F-SLOW-NEXT: movb %al, (%rsi) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: mask_replication_factor2_vf4: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 +; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512F-FAST-NEXT: kmovw %k0, %eax +; AVX512F-FAST-NEXT: movb %al, (%rsi) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: mask_replication_factor2_vf4: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k0 +; AVX512DQ-SLOW-NEXT: kmovb %k0, (%rsi) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: mask_replication_factor2_vf4: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k0 +; AVX512DQ-FAST-NEXT: kmovb %k0, (%rsi) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: mask_replication_factor2_vf4: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512BW-SLOW-NEXT: kmovd %k0, %eax +; AVX512BW-SLOW-NEXT: movb %al, (%rsi) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: mask_replication_factor2_vf4: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: kmovw (%rdi), %k1 +; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512BW-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512BW-FAST-NEXT: kmovd %k0, %eax +; AVX512BW-FAST-NEXT: movb %al, (%rsi) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq +; +; AVX512VBMI-SLOW-LABEL: mask_replication_factor2_vf4: +; AVX512VBMI-SLOW: # %bb.0: +; AVX512VBMI-SLOW-NEXT: kmovw (%rdi), %k1 +; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VBMI-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VBMI-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512VBMI-SLOW-NEXT: kmovd %k0, %eax +; AVX512VBMI-SLOW-NEXT: movb %al, (%rsi) +; AVX512VBMI-SLOW-NEXT: vzeroupper +; AVX512VBMI-SLOW-NEXT: retq +; +; AVX512VBMI-FAST-LABEL: mask_replication_factor2_vf4: +; AVX512VBMI-FAST: # %bb.0: +; AVX512VBMI-FAST-NEXT: kmovw (%rdi), %k1 +; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX512VBMI-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VBMI-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512VBMI-FAST-NEXT: kmovd %k0, %eax +; AVX512VBMI-FAST-NEXT: movb %al, (%rsi) +; AVX512VBMI-FAST-NEXT: vzeroupper +; AVX512VBMI-FAST-NEXT: retq + %src.vec = load <4 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <8 x i32> + store <8 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor2_vf8(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor2_vf8: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor2_vf8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor2_vf8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovw %k0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %src.vec = load <8 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <16 x i32> + store <16 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor2_vf16(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor2_vf16: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, 2(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor2_vf16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: kmovw %k1, 2(%rsi) +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor2_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %src.vec = load <16 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <32 x i32> + store <32 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor2_vf32(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor2_vf32: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 +; AVX512F-ONLY-NEXT: kmovw %k3, 4(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k2, 6(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k1, (%rsi) +; AVX512F-ONLY-NEXT: kmovw %k0, 2(%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor2_vf32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 +; AVX512DQ-NEXT: kmovw %k3, 4(%rsi) +; AVX512DQ-NEXT: kmovw %k1, 6(%rsi) +; AVX512DQ-NEXT: kmovw %k2, (%rsi) +; AVX512DQ-NEXT: kmovw %k0, 2(%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k0 +; AVX512BW-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k0 +; AVX512VBMI-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq + %src.vec = load <32 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <64 x i32> + store <64 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor2_vf64(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k3 +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4 +; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3 +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k3 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3 +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k4 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 +; AVX512F-ONLY-NEXT: kmovw %k7, 12(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k2, 14(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k6, 8(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k4, 10(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k5, 4(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k3, 6(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k1, (%rsi) +; AVX512F-ONLY-NEXT: kmovw %k0, 2(%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor2_vf64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k3 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k4 +; AVX512DQ-NEXT: kmovw 6(%rdi), %k2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k3 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5 +; AVX512DQ-NEXT: vpmovm2d %k4, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k4 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 +; AVX512DQ-NEXT: vpmovm2d %k2, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 +; AVX512DQ-NEXT: kmovw %k7, 12(%rsi) +; AVX512DQ-NEXT: kmovw %k2, 14(%rsi) +; AVX512DQ-NEXT: kmovw %k6, 8(%rsi) +; AVX512DQ-NEXT: kmovw %k4, 10(%rsi) +; AVX512DQ-NEXT: kmovw %k5, 4(%rsi) +; AVX512DQ-NEXT: kmovw %k3, 6(%rsi) +; AVX512DQ-NEXT: kmovw %k1, (%rsi) +; AVX512DQ-NEXT: kmovw %k0, 2(%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf64: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,2,3,2,3] +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5,6,7,6,7] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: kmovq %k1, 8(%rsi) +; AVX512BW-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf64: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: kmovq %k1, 8(%rsi) +; AVX512VBMI-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq + %src.vec = load <64 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <128 x i32> + store <128 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor3_vf2(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor3_vf2: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: kmovw %k0, %ecx +; AVX512F-ONLY-NEXT: andb $1, %cl +; AVX512F-ONLY-NEXT: leal (%rcx,%rcx), %edx +; AVX512F-ONLY-NEXT: orb %cl, %dl +; AVX512F-ONLY-NEXT: shlb $2, %cl +; AVX512F-ONLY-NEXT: orb %dl, %cl +; AVX512F-ONLY-NEXT: movl %eax, %edx +; AVX512F-ONLY-NEXT: andb $1, %dl +; AVX512F-ONLY-NEXT: leal (,%rdx,8), %edi +; AVX512F-ONLY-NEXT: orb %cl, %dil +; AVX512F-ONLY-NEXT: shlb $4, %dl +; AVX512F-ONLY-NEXT: orb %dil, %dl +; AVX512F-ONLY-NEXT: shlb $5, %al +; AVX512F-ONLY-NEXT: orb %dl, %al +; AVX512F-ONLY-NEXT: andb $63, %al +; AVX512F-ONLY-NEXT: movb %al, (%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor3_vf2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: kshiftrb $1, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: andb $1, %cl +; AVX512DQ-NEXT: leal (%rcx,%rcx), %edx +; AVX512DQ-NEXT: orb %cl, %dl +; AVX512DQ-NEXT: shlb $2, %cl +; AVX512DQ-NEXT: orb %dl, %cl +; AVX512DQ-NEXT: movl %eax, %edx +; AVX512DQ-NEXT: andb $1, %dl +; AVX512DQ-NEXT: leal (,%rdx,8), %edi +; AVX512DQ-NEXT: orb %cl, %dil +; AVX512DQ-NEXT: shlb $4, %dl +; AVX512DQ-NEXT: orb %dil, %dl +; AVX512DQ-NEXT: shlb $5, %al +; AVX512DQ-NEXT: orb %dl, %al +; AVX512DQ-NEXT: andb $63, %al +; AVX512DQ-NEXT: movb %al, (%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor3_vf2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: andb $1, %cl +; AVX512BW-NEXT: leal (%rcx,%rcx), %edx +; AVX512BW-NEXT: orb %cl, %dl +; AVX512BW-NEXT: shlb $2, %cl +; AVX512BW-NEXT: orb %dl, %cl +; AVX512BW-NEXT: movl %eax, %edx +; AVX512BW-NEXT: andb $1, %dl +; AVX512BW-NEXT: leal (,%rdx,8), %edi +; AVX512BW-NEXT: orb %cl, %dil +; AVX512BW-NEXT: shlb $4, %dl +; AVX512BW-NEXT: orb %dil, %dl +; AVX512BW-NEXT: shlb $5, %al +; AVX512BW-NEXT: orb %dl, %al +; AVX512BW-NEXT: andb $63, %al +; AVX512BW-NEXT: movb %al, (%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <2 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <6 x i32> + store <6 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor3_vf4(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor3_vf4: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2 +; AVX512F-ONLY-NEXT: kshiftrw $3, %k2, %k0 +; AVX512F-ONLY-NEXT: kshiftrw $2, %k2, %k1 +; AVX512F-ONLY-NEXT: kshiftrw $1, %k2, %k3 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rax,2), %ecx +; AVX512F-ONLY-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512F-ONLY-NEXT: movl %eax, %edx +; AVX512F-ONLY-NEXT: shll $4, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $5, %eax +; AVX512F-ONLY-NEXT: orl %edx, %eax +; AVX512F-ONLY-NEXT: kmovw %k1, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $6, %edx +; AVX512F-ONLY-NEXT: movl %ecx, %edi +; AVX512F-ONLY-NEXT: shll $7, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: shll $8, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: kmovw %k0, %edx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $9, %r8d +; AVX512F-ONLY-NEXT: orl %ecx, %r8d +; AVX512F-ONLY-NEXT: shll $10, %edi +; AVX512F-ONLY-NEXT: orl %r8d, %edi +; AVX512F-ONLY-NEXT: shll $11, %edx +; AVX512F-ONLY-NEXT: orl %edi, %edx +; AVX512F-ONLY-NEXT: orl %eax, %edx +; AVX512F-ONLY-NEXT: andl $4095, %edx # imm = 0xFFF +; AVX512F-ONLY-NEXT: movw %dx, (%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor3_vf4: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k2 +; AVX512DQ-NEXT: kshiftrb $3, %k2, %k0 +; AVX512DQ-NEXT: kshiftrb $2, %k2, %k1 +; AVX512DQ-NEXT: kshiftrb $1, %k2, %k3 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leal (%rax,%rax,2), %ecx +; AVX512DQ-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512DQ-NEXT: movl %eax, %edx +; AVX512DQ-NEXT: shll $4, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: shll $5, %eax +; AVX512DQ-NEXT: orl %edx, %eax +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: shll $6, %edx +; AVX512DQ-NEXT: movl %ecx, %edi +; AVX512DQ-NEXT: shll $7, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: shll $8, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: kmovw %k0, %edx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movl %edi, %r8d +; AVX512DQ-NEXT: shll $9, %r8d +; AVX512DQ-NEXT: orl %ecx, %r8d +; AVX512DQ-NEXT: shll $10, %edi +; AVX512DQ-NEXT: orl %r8d, %edi +; AVX512DQ-NEXT: shll $11, %edx +; AVX512DQ-NEXT: orl %edi, %edx +; AVX512DQ-NEXT: orl %eax, %edx +; AVX512DQ-NEXT: andl $4095, %edx # imm = 0xFFF +; AVX512DQ-NEXT: movw %dx, (%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor3_vf4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k2 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k3 +; AVX512BW-NEXT: kmovd %k2, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leal (%rax,%rax,2), %ecx +; AVX512BW-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512BW-NEXT: kmovd %k3, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512BW-NEXT: movl %eax, %edx +; AVX512BW-NEXT: shll $4, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: shll $5, %eax +; AVX512BW-NEXT: orl %edx, %eax +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: shll $6, %edx +; AVX512BW-NEXT: movl %ecx, %edi +; AVX512BW-NEXT: shll $7, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: shll $8, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: kmovd %k0, %edx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movl %edi, %r8d +; AVX512BW-NEXT: shll $9, %r8d +; AVX512BW-NEXT: orl %ecx, %r8d +; AVX512BW-NEXT: shll $10, %edi +; AVX512BW-NEXT: orl %r8d, %edi +; AVX512BW-NEXT: shll $11, %edx +; AVX512BW-NEXT: orl %edi, %edx +; AVX512BW-NEXT: orl %eax, %edx +; AVX512BW-NEXT: andl $4095, %edx # imm = 0xFFF +; AVX512BW-NEXT: movw %dx, (%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <4 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <12 x i32> + store <12 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor3_vf8(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor3_vf8: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k6 +; AVX512F-ONLY-NEXT: kshiftrw $7, %k6, %k0 +; AVX512F-ONLY-NEXT: kshiftrw $6, %k6, %k1 +; AVX512F-ONLY-NEXT: kshiftrw $5, %k6, %k2 +; AVX512F-ONLY-NEXT: kshiftrw $4, %k6, %k3 +; AVX512F-ONLY-NEXT: kshiftrw $3, %k6, %k4 +; AVX512F-ONLY-NEXT: kshiftrw $2, %k6, %k5 +; AVX512F-ONLY-NEXT: kshiftrw $1, %k6, %k7 +; AVX512F-ONLY-NEXT: kmovw %k6, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rax,2), %ecx +; AVX512F-ONLY-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512F-ONLY-NEXT: kmovw %k7, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512F-ONLY-NEXT: movl %eax, %edx +; AVX512F-ONLY-NEXT: shll $4, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $5, %eax +; AVX512F-ONLY-NEXT: orl %edx, %eax +; AVX512F-ONLY-NEXT: kmovw %k5, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $6, %edx +; AVX512F-ONLY-NEXT: movl %ecx, %edi +; AVX512F-ONLY-NEXT: shll $7, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: shll $8, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: kmovw %k4, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $9, %edi +; AVX512F-ONLY-NEXT: orl %ecx, %edi +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $10, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: shll $11, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: kmovw %k3, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movl %edi, %ecx +; AVX512F-ONLY-NEXT: shll $12, %ecx +; AVX512F-ONLY-NEXT: orl %edx, %ecx +; AVX512F-ONLY-NEXT: movl %edi, %edx +; AVX512F-ONLY-NEXT: shll $13, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $14, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: kmovw %k2, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $15, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: orl %eax, %ecx +; AVX512F-ONLY-NEXT: movw %cx, (%rsi) +; AVX512F-ONLY-NEXT: movl %edx, %eax +; AVX512F-ONLY-NEXT: shll $16, %eax +; AVX512F-ONLY-NEXT: shll $17, %edx +; AVX512F-ONLY-NEXT: orl %eax, %edx +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movl %eax, %edi +; AVX512F-ONLY-NEXT: shll $18, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: movl %eax, %edx +; AVX512F-ONLY-NEXT: shll $19, %edx +; AVX512F-ONLY-NEXT: orl %edi, %edx +; AVX512F-ONLY-NEXT: shll $20, %eax +; AVX512F-ONLY-NEXT: orl %edx, %eax +; AVX512F-ONLY-NEXT: kmovw %k0, %edx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $21, %r8d +; AVX512F-ONLY-NEXT: orl %eax, %r8d +; AVX512F-ONLY-NEXT: shll $22, %edi +; AVX512F-ONLY-NEXT: orl %r8d, %edi +; AVX512F-ONLY-NEXT: shll $23, %edx +; AVX512F-ONLY-NEXT: orl %edi, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: shrl $16, %edx +; AVX512F-ONLY-NEXT: movb %dl, 2(%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor3_vf8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k6 +; AVX512DQ-NEXT: kshiftrb $7, %k6, %k0 +; AVX512DQ-NEXT: kshiftrb $6, %k6, %k1 +; AVX512DQ-NEXT: kshiftrb $5, %k6, %k2 +; AVX512DQ-NEXT: kshiftrb $4, %k6, %k3 +; AVX512DQ-NEXT: kshiftrb $3, %k6, %k4 +; AVX512DQ-NEXT: kshiftrb $2, %k6, %k5 +; AVX512DQ-NEXT: kshiftrb $1, %k6, %k7 +; AVX512DQ-NEXT: kmovw %k6, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leal (%rax,%rax,2), %ecx +; AVX512DQ-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512DQ-NEXT: kmovw %k7, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512DQ-NEXT: movl %eax, %edx +; AVX512DQ-NEXT: shll $4, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: shll $5, %eax +; AVX512DQ-NEXT: orl %edx, %eax +; AVX512DQ-NEXT: kmovw %k5, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: shll $6, %edx +; AVX512DQ-NEXT: movl %ecx, %edi +; AVX512DQ-NEXT: shll $7, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: shll $8, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: kmovw %k4, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $9, %edi +; AVX512DQ-NEXT: orl %ecx, %edi +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $10, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: shll $11, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: kmovw %k3, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movl %edi, %ecx +; AVX512DQ-NEXT: shll $12, %ecx +; AVX512DQ-NEXT: orl %edx, %ecx +; AVX512DQ-NEXT: movl %edi, %edx +; AVX512DQ-NEXT: shll $13, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: shll $14, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: kmovw %k2, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $15, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: orl %eax, %ecx +; AVX512DQ-NEXT: movw %cx, (%rsi) +; AVX512DQ-NEXT: movl %edx, %eax +; AVX512DQ-NEXT: shll $16, %eax +; AVX512DQ-NEXT: shll $17, %edx +; AVX512DQ-NEXT: orl %eax, %edx +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movl %eax, %edi +; AVX512DQ-NEXT: shll $18, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: movl %eax, %edx +; AVX512DQ-NEXT: shll $19, %edx +; AVX512DQ-NEXT: orl %edi, %edx +; AVX512DQ-NEXT: shll $20, %eax +; AVX512DQ-NEXT: orl %edx, %eax +; AVX512DQ-NEXT: kmovw %k0, %edx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movl %edi, %r8d +; AVX512DQ-NEXT: shll $21, %r8d +; AVX512DQ-NEXT: orl %eax, %r8d +; AVX512DQ-NEXT: shll $22, %edi +; AVX512DQ-NEXT: orl %r8d, %edi +; AVX512DQ-NEXT: shll $23, %edx +; AVX512DQ-NEXT: orl %edi, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: shrl $16, %edx +; AVX512DQ-NEXT: movb %dl, 2(%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor3_vf8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k6 +; AVX512BW-NEXT: kshiftrw $7, %k6, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k6, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k6, %k3 +; AVX512BW-NEXT: kshiftrw $3, %k6, %k4 +; AVX512BW-NEXT: kshiftrw $2, %k6, %k5 +; AVX512BW-NEXT: kshiftrw $1, %k6, %k7 +; AVX512BW-NEXT: kmovd %k6, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leal (%rax,%rax,2), %ecx +; AVX512BW-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512BW-NEXT: kmovd %k7, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512BW-NEXT: movl %eax, %edx +; AVX512BW-NEXT: shll $4, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: shll $5, %eax +; AVX512BW-NEXT: orl %edx, %eax +; AVX512BW-NEXT: kmovd %k5, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: shll $6, %edx +; AVX512BW-NEXT: movl %ecx, %edi +; AVX512BW-NEXT: shll $7, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: shll $8, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: kmovd %k4, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $9, %edi +; AVX512BW-NEXT: orl %ecx, %edi +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $10, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: shll $11, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: kmovd %k3, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movl %edi, %ecx +; AVX512BW-NEXT: shll $12, %ecx +; AVX512BW-NEXT: orl %edx, %ecx +; AVX512BW-NEXT: movl %edi, %edx +; AVX512BW-NEXT: shll $13, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: shll $14, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: kmovd %k2, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $15, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: orl %eax, %ecx +; AVX512BW-NEXT: movw %cx, (%rsi) +; AVX512BW-NEXT: movl %edx, %eax +; AVX512BW-NEXT: shll $16, %eax +; AVX512BW-NEXT: shll $17, %edx +; AVX512BW-NEXT: orl %eax, %edx +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movl %eax, %edi +; AVX512BW-NEXT: shll $18, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: movl %eax, %edx +; AVX512BW-NEXT: shll $19, %edx +; AVX512BW-NEXT: orl %edi, %edx +; AVX512BW-NEXT: shll $20, %eax +; AVX512BW-NEXT: orl %edx, %eax +; AVX512BW-NEXT: kmovd %k0, %edx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movl %edi, %r8d +; AVX512BW-NEXT: shll $21, %r8d +; AVX512BW-NEXT: orl %eax, %r8d +; AVX512BW-NEXT: shll $22, %edi +; AVX512BW-NEXT: orl %r8d, %edi +; AVX512BW-NEXT: shll $23, %edx +; AVX512BW-NEXT: orl %edi, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: shrl $16, %edx +; AVX512BW-NEXT: movb %dl, 2(%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <8 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <24 x i32> + store <24 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor3_vf16(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor3_vf16: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 +; AVX512F-ONLY-NEXT: kshiftrw $15, %k4, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, %eax +; AVX512F-ONLY-NEXT: kshiftrw $14, %k4, %k0 +; AVX512F-ONLY-NEXT: kshiftrw $13, %k4, %k1 +; AVX512F-ONLY-NEXT: kshiftrw $12, %k4, %k2 +; AVX512F-ONLY-NEXT: kshiftrw $11, %k4, %k3 +; AVX512F-ONLY-NEXT: kshiftrw $10, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $9, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %edx +; AVX512F-ONLY-NEXT: kshiftrw $8, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $3, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $2, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %edi +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: movzbl %r15b, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: leaq (%r15,%r15,2), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%r15,4), %r15 +; AVX512F-ONLY-NEXT: movzbl %dil, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: leaq (%r15,%rdi,8), %r15 +; AVX512F-ONLY-NEXT: movq %rdi, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $5, %rdi +; AVX512F-ONLY-NEXT: orq %r12, %rdi +; AVX512F-ONLY-NEXT: movzbl %r14b, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %r15 +; AVX512F-ONLY-NEXT: shlq $6, %r15 +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $8, %r14 +; AVX512F-ONLY-NEXT: orq %r12, %r14 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $9, %r12 +; AVX512F-ONLY-NEXT: orq %r14, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r14 +; AVX512F-ONLY-NEXT: shlq $10, %r14 +; AVX512F-ONLY-NEXT: orq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $11, %r15 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: movzbl %bl, %ebx +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %r14 +; AVX512F-ONLY-NEXT: shlq $12, %r14 +; AVX512F-ONLY-NEXT: orq %r15, %r14 +; AVX512F-ONLY-NEXT: movq %rbx, %r15 +; AVX512F-ONLY-NEXT: shlq $13, %r15 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: shlq $14, %rbx +; AVX512F-ONLY-NEXT: orq %r15, %rbx +; AVX512F-ONLY-NEXT: movzbl %r11b, %r11d +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: movq %r11, %r14 +; AVX512F-ONLY-NEXT: shlq $15, %r14 +; AVX512F-ONLY-NEXT: orq %rbx, %r14 +; AVX512F-ONLY-NEXT: movq %r11, %rbx +; AVX512F-ONLY-NEXT: shlq $16, %rbx +; AVX512F-ONLY-NEXT: orq %r14, %rbx +; AVX512F-ONLY-NEXT: shlq $17, %r11 +; AVX512F-ONLY-NEXT: orq %rbx, %r11 +; AVX512F-ONLY-NEXT: movzbl %r10b, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rbx +; AVX512F-ONLY-NEXT: shlq $18, %rbx +; AVX512F-ONLY-NEXT: orq %r11, %rbx +; AVX512F-ONLY-NEXT: movq %r10, %r11 +; AVX512F-ONLY-NEXT: shlq $19, %r11 +; AVX512F-ONLY-NEXT: orq %rbx, %r11 +; AVX512F-ONLY-NEXT: shlq $20, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: movzbl %r9b, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $21, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $22, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: shlq $23, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movzbl %r8b, %r8d +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %r10 +; AVX512F-ONLY-NEXT: shlq $24, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $25, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: shlq $26, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: movzbl %dl, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %rdx +; AVX512F-ONLY-NEXT: shlq $27, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: movq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $28, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $29, %r9 +; AVX512F-ONLY-NEXT: orq %r8, %r9 +; AVX512F-ONLY-NEXT: movzbl %cl, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $30, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $31, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movl %edx, (%rsi) +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: shlq $32, %rcx +; AVX512F-ONLY-NEXT: kmovw %k3, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $33, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rcx +; AVX512F-ONLY-NEXT: shlq $34, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: shlq $35, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: kmovw %k2, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $36, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $37, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: shlq $38, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: kmovw %k1, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $39, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rcx +; AVX512F-ONLY-NEXT: shlq $40, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: shlq $41, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: kmovw %k0, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $42, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $43, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: shlq $44, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: movzbl %al, %edi +; AVX512F-ONLY-NEXT: # kill: def $eax killed $eax def $rax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $45, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: shrq $32, %rdi +; AVX512F-ONLY-NEXT: movw %di, 4(%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor3_vf16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k4 +; AVX512DQ-NEXT: kshiftrw $15, %k4, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: kshiftrw $14, %k4, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k4, %k1 +; AVX512DQ-NEXT: kshiftrw $12, %k4, %k2 +; AVX512DQ-NEXT: kshiftrw $11, %k4, %k3 +; AVX512DQ-NEXT: kshiftrw $10, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %ecx +; AVX512DQ-NEXT: kshiftrw $9, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %edx +; AVX512DQ-NEXT: kshiftrw $8, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %r8d +; AVX512DQ-NEXT: kshiftrw $7, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %r9d +; AVX512DQ-NEXT: kshiftrw $6, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %r10d +; AVX512DQ-NEXT: kshiftrw $5, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %r11d +; AVX512DQ-NEXT: kshiftrw $4, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %ebx +; AVX512DQ-NEXT: kshiftrw $3, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %ebp +; AVX512DQ-NEXT: kshiftrw $2, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %r14d +; AVX512DQ-NEXT: kshiftrw $1, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %edi +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: movzbl %r15b, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: leaq (%r15,%r15,2), %r12 +; AVX512DQ-NEXT: leaq (%r12,%r15,4), %r15 +; AVX512DQ-NEXT: movzbl %dil, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: leaq (%r15,%rdi,8), %r15 +; AVX512DQ-NEXT: movq %rdi, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: shlq $5, %rdi +; AVX512DQ-NEXT: orq %r12, %rdi +; AVX512DQ-NEXT: movzbl %r14b, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %r15 +; AVX512DQ-NEXT: shlq $6, %r15 +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $7, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: shlq $8, %r14 +; AVX512DQ-NEXT: orq %r12, %r14 +; AVX512DQ-NEXT: movzbl %bpl, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $9, %r12 +; AVX512DQ-NEXT: orq %r14, %r12 +; AVX512DQ-NEXT: movq %r15, %r14 +; AVX512DQ-NEXT: shlq $10, %r14 +; AVX512DQ-NEXT: orq %r12, %r14 +; AVX512DQ-NEXT: shlq $11, %r15 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: movzbl %bl, %ebx +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %r14 +; AVX512DQ-NEXT: shlq $12, %r14 +; AVX512DQ-NEXT: orq %r15, %r14 +; AVX512DQ-NEXT: movq %rbx, %r15 +; AVX512DQ-NEXT: shlq $13, %r15 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: shlq $14, %rbx +; AVX512DQ-NEXT: orq %r15, %rbx +; AVX512DQ-NEXT: movzbl %r11b, %r11d +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: movq %r11, %r14 +; AVX512DQ-NEXT: shlq $15, %r14 +; AVX512DQ-NEXT: orq %rbx, %r14 +; AVX512DQ-NEXT: movq %r11, %rbx +; AVX512DQ-NEXT: shlq $16, %rbx +; AVX512DQ-NEXT: orq %r14, %rbx +; AVX512DQ-NEXT: shlq $17, %r11 +; AVX512DQ-NEXT: orq %rbx, %r11 +; AVX512DQ-NEXT: movzbl %r10b, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rbx +; AVX512DQ-NEXT: shlq $18, %rbx +; AVX512DQ-NEXT: orq %r11, %rbx +; AVX512DQ-NEXT: movq %r10, %r11 +; AVX512DQ-NEXT: shlq $19, %r11 +; AVX512DQ-NEXT: orq %rbx, %r11 +; AVX512DQ-NEXT: shlq $20, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: movzbl %r9b, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $21, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $22, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: shlq $23, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movzbl %r8b, %r8d +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %r10 +; AVX512DQ-NEXT: shlq $24, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $25, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: shlq $26, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: movzbl %dl, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %rdx +; AVX512DQ-NEXT: shlq $27, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: movq %r9, %r8 +; AVX512DQ-NEXT: shlq $28, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: shlq $29, %r9 +; AVX512DQ-NEXT: orq %r8, %r9 +; AVX512DQ-NEXT: movzbl %cl, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %r8 +; AVX512DQ-NEXT: shlq $30, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: shlq $31, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movl %edx, (%rsi) +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: shlq $32, %rcx +; AVX512DQ-NEXT: kmovw %k3, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $33, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: movq %rdi, %rcx +; AVX512DQ-NEXT: shlq $34, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: shlq $35, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: kmovw %k2, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %r8 +; AVX512DQ-NEXT: shlq $36, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $37, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: shlq $38, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: kmovw %k1, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $39, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: movq %rdi, %rcx +; AVX512DQ-NEXT: shlq $40, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: shlq $41, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %r8 +; AVX512DQ-NEXT: shlq $42, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $43, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: shlq $44, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: movzbl %al, %edi +; AVX512DQ-NEXT: # kill: def $eax killed $eax def $rax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $45, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: shlq $47, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: shrq $32, %rdi +; AVX512DQ-NEXT: movw %di, 4(%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor3_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovw (%rdi), %k4 +; AVX512BW-NEXT: kshiftrw $15, %k4, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: kshiftrw $14, %k4, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k3 +; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %ecx +; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %edx +; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %r8d +; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %r9d +; AVX512BW-NEXT: kshiftrw $6, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %r10d +; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %r11d +; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %ebx +; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %ebp +; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %r14d +; AVX512BW-NEXT: kshiftrw $1, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %edi +; AVX512BW-NEXT: kmovd %k4, %r15d +; AVX512BW-NEXT: movzbl %r15b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: leaq (%r15,%r15,2), %r12 +; AVX512BW-NEXT: leaq (%r12,%r15,4), %r15 +; AVX512BW-NEXT: movzbl %dil, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: leaq (%r15,%rdi,8), %r15 +; AVX512BW-NEXT: movq %rdi, %r12 +; AVX512BW-NEXT: shlq $4, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: shlq $5, %rdi +; AVX512BW-NEXT: orq %r12, %rdi +; AVX512BW-NEXT: movzbl %r14b, %r14d +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %r15 +; AVX512BW-NEXT: shlq $6, %r15 +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: shlq $8, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: movzbl %bpl, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: movq %r15, %r14 +; AVX512BW-NEXT: shlq $10, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: shlq $11, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: movzbl %bl, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %r14 +; AVX512BW-NEXT: shlq $12, %r14 +; AVX512BW-NEXT: orq %r15, %r14 +; AVX512BW-NEXT: movq %rbx, %r15 +; AVX512BW-NEXT: shlq $13, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: shlq $14, %rbx +; AVX512BW-NEXT: orq %r15, %rbx +; AVX512BW-NEXT: movzbl %r11b, %r11d +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %r14 +; AVX512BW-NEXT: shlq $15, %r14 +; AVX512BW-NEXT: orq %rbx, %r14 +; AVX512BW-NEXT: movq %r11, %rbx +; AVX512BW-NEXT: shlq $16, %rbx +; AVX512BW-NEXT: orq %r14, %rbx +; AVX512BW-NEXT: shlq $17, %r11 +; AVX512BW-NEXT: orq %rbx, %r11 +; AVX512BW-NEXT: movzbl %r10b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %rbx +; AVX512BW-NEXT: shlq $18, %rbx +; AVX512BW-NEXT: orq %r11, %rbx +; AVX512BW-NEXT: movq %r10, %r11 +; AVX512BW-NEXT: shlq $19, %r11 +; AVX512BW-NEXT: orq %rbx, %r11 +; AVX512BW-NEXT: shlq $20, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: movzbl %r9b, %r9d +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $21, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $22, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: shlq $23, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movzbl %r8b, %r8d +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %r10 +; AVX512BW-NEXT: shlq $24, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $25, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: shlq $26, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: movzbl %dl, %r9d +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %rdx +; AVX512BW-NEXT: shlq $27, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: movq %r9, %r8 +; AVX512BW-NEXT: shlq $28, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: shlq $29, %r9 +; AVX512BW-NEXT: orq %r8, %r9 +; AVX512BW-NEXT: movzbl %cl, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %r8 +; AVX512BW-NEXT: shlq $30, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: shlq $31, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movl %edx, (%rsi) +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: shlq $32, %rcx +; AVX512BW-NEXT: kmovd %k3, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $33, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: movq %rdi, %rcx +; AVX512BW-NEXT: shlq $34, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: shlq $35, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: kmovd %k2, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %r8 +; AVX512BW-NEXT: shlq $36, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $37, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: shlq $38, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $39, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: movq %rdi, %rcx +; AVX512BW-NEXT: shlq $40, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: shlq $41, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %r8 +; AVX512BW-NEXT: shlq $42, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $43, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: shlq $44, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: movzbl %al, %edi +; AVX512BW-NEXT: # kill: def $eax killed $eax def $rax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $45, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: shlq $47, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: shrq $32, %rdi +; AVX512BW-NEXT: movw %di, 4(%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <16 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <48 x i32> + store <48 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor3_vf32(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor3_vf32: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r13 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k0 +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-ONLY-NEXT: kshiftrw $10, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $9, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %edx +; AVX512F-ONLY-NEXT: kshiftrw $8, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %edi +; AVX512F-ONLY-NEXT: kshiftrw $7, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k0, %k2 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movl %eax, %ebp +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: leal (%rbp,%rbp,2), %ebp +; AVX512F-ONLY-NEXT: leal (%rbp,%r14,4), %ebp +; AVX512F-ONLY-NEXT: leal (%rbp,%r14,8), %ebp +; AVX512F-ONLY-NEXT: shll $4, %r14d +; AVX512F-ONLY-NEXT: orl %ebp, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movl %r15d, %ebp +; AVX512F-ONLY-NEXT: shll $5, %ebp +; AVX512F-ONLY-NEXT: orl %r14d, %ebp +; AVX512F-ONLY-NEXT: movl %r15d, %r14d +; AVX512F-ONLY-NEXT: shll $6, %r14d +; AVX512F-ONLY-NEXT: shll $7, %r15d +; AVX512F-ONLY-NEXT: orl %r14d, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movl %r12d, %r14d +; AVX512F-ONLY-NEXT: shll $8, %r14d +; AVX512F-ONLY-NEXT: orl %r15d, %r14d +; AVX512F-ONLY-NEXT: movl %r12d, %r15d +; AVX512F-ONLY-NEXT: shll $9, %r15d +; AVX512F-ONLY-NEXT: orl %r14d, %r15d +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $11, %k0, %k2 +; AVX512F-ONLY-NEXT: shll $10, %r12d +; AVX512F-ONLY-NEXT: orl %r15d, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movl %r13d, %r14d +; AVX512F-ONLY-NEXT: shll $11, %r14d +; AVX512F-ONLY-NEXT: orl %r12d, %r14d +; AVX512F-ONLY-NEXT: movl %r13d, %r15d +; AVX512F-ONLY-NEXT: shll $12, %r15d +; AVX512F-ONLY-NEXT: orl %r14d, %r15d +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k0, %k2 +; AVX512F-ONLY-NEXT: shll $13, %r13d +; AVX512F-ONLY-NEXT: orl %r15d, %r13d +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movl %eax, %r14d +; AVX512F-ONLY-NEXT: shll $14, %r14d +; AVX512F-ONLY-NEXT: orl %r13d, %r14d +; AVX512F-ONLY-NEXT: movl %eax, %r15d +; AVX512F-ONLY-NEXT: shll $15, %r15d +; AVX512F-ONLY-NEXT: orl %r14d, %r15d +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k0, %k2 +; AVX512F-ONLY-NEXT: shll $16, %eax +; AVX512F-ONLY-NEXT: orl %r15d, %eax +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movl %r12d, %r15d +; AVX512F-ONLY-NEXT: shll $17, %r15d +; AVX512F-ONLY-NEXT: orl %eax, %r15d +; AVX512F-ONLY-NEXT: movl %r12d, %eax +; AVX512F-ONLY-NEXT: shll $18, %eax +; AVX512F-ONLY-NEXT: orl %r15d, %eax +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k0, %k2 +; AVX512F-ONLY-NEXT: shll $19, %r12d +; AVX512F-ONLY-NEXT: orl %eax, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movl %r14d, %eax +; AVX512F-ONLY-NEXT: shll $20, %eax +; AVX512F-ONLY-NEXT: orl %r12d, %eax +; AVX512F-ONLY-NEXT: movl %r14d, %r12d +; AVX512F-ONLY-NEXT: shll $21, %r12d +; AVX512F-ONLY-NEXT: orl %eax, %r12d +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $15, %k0, %k2 +; AVX512F-ONLY-NEXT: shll $22, %r14d +; AVX512F-ONLY-NEXT: orl %r12d, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movl %r15d, %r12d +; AVX512F-ONLY-NEXT: shll $23, %r12d +; AVX512F-ONLY-NEXT: orl %r14d, %r12d +; AVX512F-ONLY-NEXT: movl %r15d, %r13d +; AVX512F-ONLY-NEXT: shll $24, %r13d +; AVX512F-ONLY-NEXT: orl %r12d, %r13d +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k1, %k2 +; AVX512F-ONLY-NEXT: shll $25, %r15d +; AVX512F-ONLY-NEXT: orl %r13d, %r15d +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movl %eax, %r12d +; AVX512F-ONLY-NEXT: shll $26, %r12d +; AVX512F-ONLY-NEXT: orl %r15d, %r12d +; AVX512F-ONLY-NEXT: movl %eax, %r15d +; AVX512F-ONLY-NEXT: shll $27, %r15d +; AVX512F-ONLY-NEXT: orl %r12d, %r15d +; AVX512F-ONLY-NEXT: shll $28, %eax +; AVX512F-ONLY-NEXT: orl %r15d, %eax +; AVX512F-ONLY-NEXT: movl %r14d, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movl %r13d, %r12d +; AVX512F-ONLY-NEXT: shll $29, %r12d +; AVX512F-ONLY-NEXT: orl %eax, %r12d +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k2 +; AVX512F-ONLY-NEXT: shll $30, %r13d +; AVX512F-ONLY-NEXT: orl %r12d, %r13d +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k1, %k2 +; AVX512F-ONLY-NEXT: shll $31, %r14d +; AVX512F-ONLY-NEXT: orl %r13d, %r14d +; AVX512F-ONLY-NEXT: kmovw %k2, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k1, %k2 +; AVX512F-ONLY-NEXT: orl %ebp, %r14d +; AVX512F-ONLY-NEXT: movl %r14d, 8(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: movzbl %al, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %r14 +; AVX512F-ONLY-NEXT: leaq (%r14,%rax,4), %rax +; AVX512F-ONLY-NEXT: movzbl %r12b, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: leaq (%rax,%r14,8), %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $5, %r14 +; AVX512F-ONLY-NEXT: orq %r12, %r14 +; AVX512F-ONLY-NEXT: movzbl %r15b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $6, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %bl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rbx +; AVX512F-ONLY-NEXT: shlq $9, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $14, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $11, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %r11b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r11 +; AVX512F-ONLY-NEXT: shlq $12, %r11 +; AVX512F-ONLY-NEXT: orq %r12, %r11 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $13, %r12 +; AVX512F-ONLY-NEXT: orq %r11, %r12 +; AVX512F-ONLY-NEXT: kmovw %k2, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %r10b, %ebp +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movq %rbp, %r10 +; AVX512F-ONLY-NEXT: shlq $15, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $17, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: movzbl %r9b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %rbp, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $19, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k1, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $20, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: movzbl %r8b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $21, %r8 +; AVX512F-ONLY-NEXT: orq %r12, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $22, %r12 +; AVX512F-ONLY-NEXT: orq %r8, %r12 +; AVX512F-ONLY-NEXT: kmovw %k1, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %dil, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rdi +; AVX512F-ONLY-NEXT: shlq $24, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $25, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %edi +; AVX512F-ONLY-NEXT: shlq $26, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %dl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $27, %rdx +; AVX512F-ONLY-NEXT: orq %r12, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $28, %r12 +; AVX512F-ONLY-NEXT: orq %rdx, %r12 +; AVX512F-ONLY-NEXT: shlq $29, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %cl, %ebp +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movq %rbp, %rdx +; AVX512F-ONLY-NEXT: shlq $30, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: kmovw %k0, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k0 +; AVX512F-ONLY-NEXT: shlq $31, %rbp +; AVX512F-ONLY-NEXT: orq %rdx, %rbp +; AVX512F-ONLY-NEXT: kmovw %k0, %edx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: shlq $32, %rcx +; AVX512F-ONLY-NEXT: orq %rbp, %rcx +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $34, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $35, %r13 +; AVX512F-ONLY-NEXT: orq %rcx, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $37, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $38, %r15 +; AVX512F-ONLY-NEXT: orq %rcx, %r15 +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $39, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %rcx +; AVX512F-ONLY-NEXT: shlq $40, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $41, %rbx +; AVX512F-ONLY-NEXT: orq %rcx, %rbx +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r11, %rcx +; AVX512F-ONLY-NEXT: shlq $43, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $44, %r11 +; AVX512F-ONLY-NEXT: orq %rcx, %r11 +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $45, %rax +; AVX512F-ONLY-NEXT: orq %r11, %rax +; AVX512F-ONLY-NEXT: movq %r10, %rcx +; AVX512F-ONLY-NEXT: shlq $46, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $47, %r10 +; AVX512F-ONLY-NEXT: orq %rcx, %r10 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $49, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $50, %r12 +; AVX512F-ONLY-NEXT: orq %rcx, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $51, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r9, %rcx +; AVX512F-ONLY-NEXT: shlq $52, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $53, %r9 +; AVX512F-ONLY-NEXT: orq %rcx, %r9 +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %r8, %rcx +; AVX512F-ONLY-NEXT: shlq $55, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $56, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %rcx +; AVX512F-ONLY-NEXT: shlq $58, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $59, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $60, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $61, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $62, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512F-ONLY-NEXT: shlq $63, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %rax, (%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r13 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor3_vf32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k1 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %ecx +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %edx +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %edi +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r8d +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r9d +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r10d +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r11d +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %ebx +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r13d +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movl %eax, %ebp +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: leal (%rbp,%rbp,2), %ebp +; AVX512DQ-NEXT: leal (%rbp,%r14,4), %ebp +; AVX512DQ-NEXT: leal (%rbp,%r14,8), %ebp +; AVX512DQ-NEXT: shll $4, %r14d +; AVX512DQ-NEXT: orl %ebp, %r14d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movl %r15d, %ebp +; AVX512DQ-NEXT: shll $5, %ebp +; AVX512DQ-NEXT: orl %r14d, %ebp +; AVX512DQ-NEXT: movl %r15d, %r14d +; AVX512DQ-NEXT: shll $6, %r14d +; AVX512DQ-NEXT: shll $7, %r15d +; AVX512DQ-NEXT: orl %r14d, %r15d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movl %r12d, %r14d +; AVX512DQ-NEXT: shll $8, %r14d +; AVX512DQ-NEXT: orl %r15d, %r14d +; AVX512DQ-NEXT: movl %r12d, %r15d +; AVX512DQ-NEXT: shll $9, %r15d +; AVX512DQ-NEXT: orl %r14d, %r15d +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQ-NEXT: shll $10, %r12d +; AVX512DQ-NEXT: orl %r15d, %r12d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movl %r13d, %r14d +; AVX512DQ-NEXT: shll $11, %r14d +; AVX512DQ-NEXT: orl %r12d, %r14d +; AVX512DQ-NEXT: movl %r13d, %r15d +; AVX512DQ-NEXT: shll $12, %r15d +; AVX512DQ-NEXT: orl %r14d, %r15d +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQ-NEXT: shll $13, %r13d +; AVX512DQ-NEXT: orl %r15d, %r13d +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movl %eax, %r14d +; AVX512DQ-NEXT: shll $14, %r14d +; AVX512DQ-NEXT: orl %r13d, %r14d +; AVX512DQ-NEXT: movl %eax, %r15d +; AVX512DQ-NEXT: shll $15, %r15d +; AVX512DQ-NEXT: orl %r14d, %r15d +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQ-NEXT: shll $16, %eax +; AVX512DQ-NEXT: orl %r15d, %eax +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movl %r12d, %r15d +; AVX512DQ-NEXT: shll $17, %r15d +; AVX512DQ-NEXT: orl %eax, %r15d +; AVX512DQ-NEXT: movl %r12d, %eax +; AVX512DQ-NEXT: shll $18, %eax +; AVX512DQ-NEXT: orl %r15d, %eax +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQ-NEXT: shll $19, %r12d +; AVX512DQ-NEXT: orl %eax, %r12d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movl %r14d, %eax +; AVX512DQ-NEXT: shll $20, %eax +; AVX512DQ-NEXT: orl %r12d, %eax +; AVX512DQ-NEXT: movl %r14d, %r12d +; AVX512DQ-NEXT: shll $21, %r12d +; AVX512DQ-NEXT: orl %eax, %r12d +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k2 +; AVX512DQ-NEXT: shll $22, %r14d +; AVX512DQ-NEXT: orl %r12d, %r14d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movl %r15d, %r12d +; AVX512DQ-NEXT: shll $23, %r12d +; AVX512DQ-NEXT: orl %r14d, %r12d +; AVX512DQ-NEXT: movl %r15d, %r13d +; AVX512DQ-NEXT: shll $24, %r13d +; AVX512DQ-NEXT: orl %r12d, %r13d +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k2 +; AVX512DQ-NEXT: shll $25, %r15d +; AVX512DQ-NEXT: orl %r13d, %r15d +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movl %eax, %r12d +; AVX512DQ-NEXT: shll $26, %r12d +; AVX512DQ-NEXT: orl %r15d, %r12d +; AVX512DQ-NEXT: movl %eax, %r15d +; AVX512DQ-NEXT: shll $27, %r15d +; AVX512DQ-NEXT: orl %r12d, %r15d +; AVX512DQ-NEXT: shll $28, %eax +; AVX512DQ-NEXT: orl %r15d, %eax +; AVX512DQ-NEXT: movl %r14d, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movl %r13d, %r12d +; AVX512DQ-NEXT: shll $29, %r12d +; AVX512DQ-NEXT: orl %eax, %r12d +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k2 +; AVX512DQ-NEXT: shll $30, %r13d +; AVX512DQ-NEXT: orl %r12d, %r13d +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k2 +; AVX512DQ-NEXT: shll $31, %r14d +; AVX512DQ-NEXT: orl %r13d, %r14d +; AVX512DQ-NEXT: kmovw %k2, %r13d +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k2 +; AVX512DQ-NEXT: orl %ebp, %r14d +; AVX512DQ-NEXT: movl %r14d, 8(%rsi) +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: movzbl %al, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %r14 +; AVX512DQ-NEXT: leaq (%r14,%rax,4), %rax +; AVX512DQ-NEXT: movzbl %r12b, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: leaq (%rax,%r14,8), %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $5, %r14 +; AVX512DQ-NEXT: orq %r12, %r14 +; AVX512DQ-NEXT: movzbl %r15b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $6, %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $7, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k2 +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %bl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rbx +; AVX512DQ-NEXT: shlq $9, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: kmovw %k2, %ebx +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k2 +; AVX512DQ-NEXT: shlq $11, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %r11b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r11 +; AVX512DQ-NEXT: shlq $12, %r11 +; AVX512DQ-NEXT: orq %r12, %r11 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $13, %r12 +; AVX512DQ-NEXT: orq %r11, %r12 +; AVX512DQ-NEXT: kmovw %k2, %r11d +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %r10b, %ebp +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movq %rbp, %r10 +; AVX512DQ-NEXT: shlq $15, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: kmovw %k1, %r10d +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k1 +; AVX512DQ-NEXT: shlq $17, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: movzbl %r9b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %rbp, %rax +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $19, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k1, %r9d +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512DQ-NEXT: shlq $20, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: movzbl %r8b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $21, %r8 +; AVX512DQ-NEXT: orq %r12, %r8 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $22, %r12 +; AVX512DQ-NEXT: orq %r8, %r12 +; AVX512DQ-NEXT: kmovw %k1, %r8d +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k1 +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %dil, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rdi +; AVX512DQ-NEXT: shlq $24, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $25, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: kmovw %k1, %edi +; AVX512DQ-NEXT: shlq $26, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %dl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $27, %rdx +; AVX512DQ-NEXT: orq %r12, %rdx +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $28, %r12 +; AVX512DQ-NEXT: orq %rdx, %r12 +; AVX512DQ-NEXT: shlq $29, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %cl, %ebp +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movq %rbp, %rdx +; AVX512DQ-NEXT: shlq $30, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: kmovw %k0, %r12d +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 +; AVX512DQ-NEXT: shlq $31, %rbp +; AVX512DQ-NEXT: orq %rdx, %rbp +; AVX512DQ-NEXT: kmovw %k0, %edx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: shlq $32, %rcx +; AVX512DQ-NEXT: orq %rbp, %rcx +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $34, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $35, %r13 +; AVX512DQ-NEXT: orq %rcx, %r13 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $37, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $38, %r15 +; AVX512DQ-NEXT: orq %rcx, %r15 +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $39, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %rbx, %rcx +; AVX512DQ-NEXT: shlq $40, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $41, %rbx +; AVX512DQ-NEXT: orq %rcx, %rbx +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r11, %rcx +; AVX512DQ-NEXT: shlq $43, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $44, %r11 +; AVX512DQ-NEXT: orq %rcx, %r11 +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $45, %rax +; AVX512DQ-NEXT: orq %r11, %rax +; AVX512DQ-NEXT: movq %r10, %rcx +; AVX512DQ-NEXT: shlq $46, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $47, %r10 +; AVX512DQ-NEXT: orq %rcx, %r10 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $49, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $50, %r12 +; AVX512DQ-NEXT: orq %rcx, %r12 +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $51, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r9, %rcx +; AVX512DQ-NEXT: shlq $52, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $53, %r9 +; AVX512DQ-NEXT: orq %rcx, %r9 +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %r8, %rcx +; AVX512DQ-NEXT: shlq $55, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $56, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rdi, %rcx +; AVX512DQ-NEXT: shlq $58, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $59, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %rax +; AVX512DQ-NEXT: shlq $60, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movq %rdx, %rcx +; AVX512DQ-NEXT: shlq $61, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $62, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512DQ-NEXT: shlq $63, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %rax, (%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor3_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: movq %rsi, %rdx +; AVX512BW-NEXT: kmovd (%rdi), %k0 +; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %esi +; AVX512BW-NEXT: kshiftrd $10, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrd $9, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $8, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrd $7, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrd $6, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrd $5, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrd $4, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrd $3, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrd $22, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $23, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $25, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movl %esi, %ebp +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: leal (%rbp,%rbp,2), %ebp +; AVX512BW-NEXT: leal (%rbp,%r14,4), %ebp +; AVX512BW-NEXT: leal (%rbp,%r14,8), %ebp +; AVX512BW-NEXT: shll $4, %r14d +; AVX512BW-NEXT: orl %ebp, %r14d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movl %r15d, %ebp +; AVX512BW-NEXT: shll $5, %ebp +; AVX512BW-NEXT: orl %r14d, %ebp +; AVX512BW-NEXT: movl %r15d, %r14d +; AVX512BW-NEXT: shll $6, %r14d +; AVX512BW-NEXT: shll $7, %r15d +; AVX512BW-NEXT: orl %r14d, %r15d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movl %r12d, %r14d +; AVX512BW-NEXT: shll $8, %r14d +; AVX512BW-NEXT: orl %r15d, %r14d +; AVX512BW-NEXT: movl %r12d, %r15d +; AVX512BW-NEXT: shll $9, %r15d +; AVX512BW-NEXT: orl %r14d, %r15d +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 +; AVX512BW-NEXT: shll $10, %r12d +; AVX512BW-NEXT: orl %r15d, %r12d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movl %r13d, %r14d +; AVX512BW-NEXT: shll $11, %r14d +; AVX512BW-NEXT: orl %r12d, %r14d +; AVX512BW-NEXT: movl %r13d, %r15d +; AVX512BW-NEXT: shll $12, %r15d +; AVX512BW-NEXT: orl %r14d, %r15d +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 +; AVX512BW-NEXT: shll $13, %r13d +; AVX512BW-NEXT: orl %r15d, %r13d +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movl %eax, %r14d +; AVX512BW-NEXT: shll $14, %r14d +; AVX512BW-NEXT: orl %r13d, %r14d +; AVX512BW-NEXT: movl %eax, %r15d +; AVX512BW-NEXT: shll $15, %r15d +; AVX512BW-NEXT: orl %r14d, %r15d +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $29, %k0, %k1 +; AVX512BW-NEXT: shll $16, %eax +; AVX512BW-NEXT: orl %r15d, %eax +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movl %r12d, %r15d +; AVX512BW-NEXT: shll $17, %r15d +; AVX512BW-NEXT: orl %eax, %r15d +; AVX512BW-NEXT: movl %r12d, %eax +; AVX512BW-NEXT: shll $18, %eax +; AVX512BW-NEXT: orl %r15d, %eax +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $30, %k0, %k1 +; AVX512BW-NEXT: shll $19, %r12d +; AVX512BW-NEXT: orl %eax, %r12d +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movl %r14d, %eax +; AVX512BW-NEXT: shll $20, %eax +; AVX512BW-NEXT: orl %r12d, %eax +; AVX512BW-NEXT: movl %r14d, %r12d +; AVX512BW-NEXT: shll $21, %r12d +; AVX512BW-NEXT: orl %eax, %r12d +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrd $31, %k0, %k1 +; AVX512BW-NEXT: shll $22, %r14d +; AVX512BW-NEXT: orl %r12d, %r14d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movl %r15d, %r12d +; AVX512BW-NEXT: shll $23, %r12d +; AVX512BW-NEXT: orl %r14d, %r12d +; AVX512BW-NEXT: movl %r15d, %r13d +; AVX512BW-NEXT: shll $24, %r13d +; AVX512BW-NEXT: orl %r12d, %r13d +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $2, %k0, %k1 +; AVX512BW-NEXT: shll $25, %r15d +; AVX512BW-NEXT: orl %r13d, %r15d +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movl %eax, %r12d +; AVX512BW-NEXT: shll $26, %r12d +; AVX512BW-NEXT: orl %r15d, %r12d +; AVX512BW-NEXT: movl %eax, %r15d +; AVX512BW-NEXT: shll $27, %r15d +; AVX512BW-NEXT: orl %r12d, %r15d +; AVX512BW-NEXT: shll $28, %eax +; AVX512BW-NEXT: orl %r15d, %eax +; AVX512BW-NEXT: movl %r14d, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movl %r13d, %r12d +; AVX512BW-NEXT: shll $29, %r12d +; AVX512BW-NEXT: orl %eax, %r12d +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 +; AVX512BW-NEXT: shll $30, %r13d +; AVX512BW-NEXT: orl %r12d, %r13d +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 +; AVX512BW-NEXT: shll $31, %r14d +; AVX512BW-NEXT: orl %r13d, %r14d +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrd $12, %k0, %k1 +; AVX512BW-NEXT: orl %ebp, %r14d +; AVX512BW-NEXT: movq %rdx, %rbp +; AVX512BW-NEXT: movl %r14d, 8(%rdx) +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: movzbl %al, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r14 +; AVX512BW-NEXT: leaq (%r14,%rax,4), %rax +; AVX512BW-NEXT: movzbl %r12b, %r14d +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: leaq (%rax,%r14,8), %rax +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $4, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: shlq $5, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: movzbl %r15b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: shlq $6, %r15 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $13, %k0, %k1 +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %bl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: shlq $9, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrd $14, %k0, %k1 +; AVX512BW-NEXT: shlq $11, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %r11b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: shlq $12, %r11 +; AVX512BW-NEXT: orq %r12, %r11 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $13, %r12 +; AVX512BW-NEXT: orq %r11, %r12 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrd $15, %k0, %k1 +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %r10b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $15, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 +; AVX512BW-NEXT: shlq $17, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %r9b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $18, %r9 +; AVX512BW-NEXT: orq %r12, %r9 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $19, %r12 +; AVX512BW-NEXT: orq %r9, %r12 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrd $17, %k0, %k1 +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %r8b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r8 +; AVX512BW-NEXT: shlq $21, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrd $18, %k0, %k1 +; AVX512BW-NEXT: shlq $23, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %dil, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $24, %rdi +; AVX512BW-NEXT: orq %r12, %rdi +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $25, %r12 +; AVX512BW-NEXT: orq %rdi, %r12 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrd $19, %k0, %k1 +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %r12 +; AVX512BW-NEXT: shlq $27, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $28, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: shlq $29, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movzbl %cl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $30, %r12 +; AVX512BW-NEXT: orq %rdx, %r12 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrd $20, %k0, %k0 +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k0, %r12d +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: shlq $32, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $34, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $35, %r13 +; AVX512BW-NEXT: orq %rcx, %r13 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $37, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $38, %r15 +; AVX512BW-NEXT: orq %rcx, %r15 +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $39, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %rbx, %rcx +; AVX512BW-NEXT: shlq $40, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $41, %rbx +; AVX512BW-NEXT: orq %rcx, %rbx +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r11, %rcx +; AVX512BW-NEXT: shlq $43, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $44, %r11 +; AVX512BW-NEXT: orq %rcx, %r11 +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $45, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r10, %rcx +; AVX512BW-NEXT: shlq $46, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $47, %r10 +; AVX512BW-NEXT: orq %rcx, %r10 +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %r9, %rcx +; AVX512BW-NEXT: shlq $49, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $50, %r9 +; AVX512BW-NEXT: orq %rcx, %r9 +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $51, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r8, %rcx +; AVX512BW-NEXT: shlq $52, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $53, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdi, %rcx +; AVX512BW-NEXT: shlq $55, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $56, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $57, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: shlq $58, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $59, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $61, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $62, %r12 +; AVX512BW-NEXT: orq %rcx, %r12 +; AVX512BW-NEXT: movzbl %sil, %eax +; AVX512BW-NEXT: shlq $63, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %rax, (%rbp) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <32 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <96 x i32> + store <96 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor3_vf64(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor3_vf64: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r13 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k0 +; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k2 +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-ONLY-NEXT: kshiftrw $15, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $14, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %edx +; AVX512F-ONLY-NEXT: kshiftrw $13, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %edi +; AVX512F-ONLY-NEXT: kshiftrw $12, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $15, %k2, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $12, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $3, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k3, %r12d +; AVX512F-ONLY-NEXT: movzbl %r12b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: leaq (%r12,%r12,2), %r13 +; AVX512F-ONLY-NEXT: leaq (%r13,%r12,4), %r12 +; AVX512F-ONLY-NEXT: movzbl %bl, %ebx +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: leaq (%r12,%rbx,8), %r12 +; AVX512F-ONLY-NEXT: movq %rbx, %r13 +; AVX512F-ONLY-NEXT: shlq $4, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $5, %rbx +; AVX512F-ONLY-NEXT: orq %r13, %rbx +; AVX512F-ONLY-NEXT: movzbl %r10b, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $6, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %r13 +; AVX512F-ONLY-NEXT: shlq $7, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $8, %r10 +; AVX512F-ONLY-NEXT: orq %r13, %r10 +; AVX512F-ONLY-NEXT: movzbl %r12b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $9, %r13 +; AVX512F-ONLY-NEXT: orq %r10, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $10, %r10 +; AVX512F-ONLY-NEXT: orq %r13, %r10 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $11, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r13 +; AVX512F-ONLY-NEXT: shlq $12, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $13, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $14, %r10 +; AVX512F-ONLY-NEXT: orq %r12, %r10 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $15, %r13 +; AVX512F-ONLY-NEXT: orq %r10, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $16, %r10 +; AVX512F-ONLY-NEXT: orq %r13, %r10 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $17, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r13 +; AVX512F-ONLY-NEXT: shlq $18, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $19, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $20, %r10 +; AVX512F-ONLY-NEXT: orq %r12, %r10 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $21, %r13 +; AVX512F-ONLY-NEXT: orq %r10, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $22, %r10 +; AVX512F-ONLY-NEXT: orq %r13, %r10 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $23, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r13 +; AVX512F-ONLY-NEXT: shlq $24, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $25, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k0, %k4 +; AVX512F-ONLY-NEXT: shlq $26, %r10 +; AVX512F-ONLY-NEXT: orq %r12, %r10 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $27, %r13 +; AVX512F-ONLY-NEXT: orq %r10, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $28, %r10 +; AVX512F-ONLY-NEXT: orq %r13, %r10 +; AVX512F-ONLY-NEXT: shlq $29, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: movzbl %r15b, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $30, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $31, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $12, %k3, %k4 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: shlq $32, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $33, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $34, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $36, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $37, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $38, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $39, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $40, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $15, %k3, %k3 +; AVX512F-ONLY-NEXT: shlq $41, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $42, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $43, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: kmovw %k3, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k3 +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $45, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $47, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $48, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $49, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k1, %k3 +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k1, %k3 +; AVX512F-ONLY-NEXT: shlq $53, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $55, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $4, %k1, %k3 +; AVX512F-ONLY-NEXT: shlq $56, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $57, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $58, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k1, %k3 +; AVX512F-ONLY-NEXT: shlq $59, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $60, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $61, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $62, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movzbl %r15b, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $63, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k0, %k3 +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %rax, (%rsi) +; AVX512F-ONLY-NEXT: movzbl %r12b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movzbl %r10b, %ebx +; AVX512F-ONLY-NEXT: movl %ebx, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: leaq (%r12,%rax,2), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%rax,4), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%rax,8), %rax +; AVX512F-ONLY-NEXT: movzbl %r14b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $4, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $5, %r14 +; AVX512F-ONLY-NEXT: orq %r13, %r14 +; AVX512F-ONLY-NEXT: shlq $6, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $7, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $8, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $15, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $9, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $10, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $11, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $1, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $12, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $13, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $14, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $15, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $16, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $17, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $2, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $18, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $19, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $20, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $3, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $21, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $22, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $4, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $24, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $25, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $26, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $5, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $27, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $28, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $29, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $30, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movzbl %r12b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $32, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $7, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $33, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $34, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $35, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $37, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $39, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $41, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $10, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $42, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $43, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $44, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $45, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $46, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $48, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $49, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $50, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $13, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $52, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $53, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k2, %k2 +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $55, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $57, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $58, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $59, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $60, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movl %r11d, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $61, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $9, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $62, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k1, %k2 +; AVX512F-ONLY-NEXT: movzbl %r11b, %r11d +; AVX512F-ONLY-NEXT: shlq $63, %r11 +; AVX512F-ONLY-NEXT: orq %rax, %r11 +; AVX512F-ONLY-NEXT: kmovw %k2, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k1, %k2 +; AVX512F-ONLY-NEXT: orq %r14, %r11 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $6, %k1, %k1 +; AVX512F-ONLY-NEXT: movq %r11, 16(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k1, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-ONLY-NEXT: movzbl %r11b, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: leaq (%r15,%r15,2), %r11 +; AVX512F-ONLY-NEXT: leaq (%r11,%r14,4), %r11 +; AVX512F-ONLY-NEXT: leaq (%r11,%r14,8), %r11 +; AVX512F-ONLY-NEXT: shlq $4, %r14 +; AVX512F-ONLY-NEXT: orq %r11, %r14 +; AVX512F-ONLY-NEXT: movzbl %al, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r11 +; AVX512F-ONLY-NEXT: shlq $5, %r11 +; AVX512F-ONLY-NEXT: orq %r14, %r11 +; AVX512F-ONLY-NEXT: movq %rax, %r14 +; AVX512F-ONLY-NEXT: shlq $6, %r14 +; AVX512F-ONLY-NEXT: shlq $7, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movzbl %r13b, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %r13 +; AVX512F-ONLY-NEXT: shlq $8, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r14, %r15 +; AVX512F-ONLY-NEXT: shlq $9, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: kmovw %k1, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $10, %r14 +; AVX512F-ONLY-NEXT: orq %r15, %r14 +; AVX512F-ONLY-NEXT: movzbl %r12b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $11, %r15 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $12, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: kmovw %k1, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $13, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $14, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $15, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $16, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %r9b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $17, %r9 +; AVX512F-ONLY-NEXT: orq %r12, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $18, %r12 +; AVX512F-ONLY-NEXT: orq %r9, %r12 +; AVX512F-ONLY-NEXT: kmovw %k1, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $19, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %r8b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r8 +; AVX512F-ONLY-NEXT: shlq $20, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $21, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $22, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %dil, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $23, %rdi +; AVX512F-ONLY-NEXT: orq %r12, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $24, %r12 +; AVX512F-ONLY-NEXT: orq %rdi, %r12 +; AVX512F-ONLY-NEXT: kmovw %k1, %edi +; AVX512F-ONLY-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $25, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %dl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rdx +; AVX512F-ONLY-NEXT: shlq $26, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $27, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %edx +; AVX512F-ONLY-NEXT: shlq $28, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %cl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $29, %rcx +; AVX512F-ONLY-NEXT: orq %r12, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $30, %r12 +; AVX512F-ONLY-NEXT: orq %rcx, %r12 +; AVX512F-ONLY-NEXT: kmovw %k0, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $9, %k0, %k0 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movq %rbp, %rcx +; AVX512F-ONLY-NEXT: shlq $32, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: kmovw %k0, %ecx +; AVX512F-ONLY-NEXT: shlq $34, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %rbp, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $36, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $37, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $39, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $40, %r14 +; AVX512F-ONLY-NEXT: orq %r12, %r14 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $41, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r14 +; AVX512F-ONLY-NEXT: shlq $42, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: shlq $43, %r15 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r9, %r14 +; AVX512F-ONLY-NEXT: shlq $45, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: shlq $46, %r9 +; AVX512F-ONLY-NEXT: orq %r14, %r9 +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $48, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $49, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $51, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $52, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $53, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $54, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $56, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $58, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $59, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $60, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $61, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: shlq $62, %r10 +; AVX512F-ONLY-NEXT: orq %rcx, %r10 +; AVX512F-ONLY-NEXT: shlq $63, %rbx +; AVX512F-ONLY-NEXT: orq %r10, %rbx +; AVX512F-ONLY-NEXT: orq %r11, %rbx +; AVX512F-ONLY-NEXT: movq %rbx, 8(%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r13 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor3_vf64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k3 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 +; AVX512DQ-NEXT: kmovw 6(%rdi), %k2 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ecx +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %edx +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %edi +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r8d +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r9d +; AVX512DQ-NEXT: kshiftrw $15, %k2, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r11d +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r14d +; AVX512DQ-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r10d +; AVX512DQ-NEXT: kshiftrw $1, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ebx +; AVX512DQ-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k3, %r12d +; AVX512DQ-NEXT: movzbl %r12b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: leaq (%r12,%r12,2), %r13 +; AVX512DQ-NEXT: leaq (%r13,%r12,4), %r12 +; AVX512DQ-NEXT: movzbl %bl, %ebx +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: leaq (%r12,%rbx,8), %r12 +; AVX512DQ-NEXT: movq %rbx, %r13 +; AVX512DQ-NEXT: shlq $4, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: shlq $5, %rbx +; AVX512DQ-NEXT: orq %r13, %rbx +; AVX512DQ-NEXT: movzbl %r10b, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $6, %r12 +; AVX512DQ-NEXT: movq %r10, %r13 +; AVX512DQ-NEXT: shlq $7, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQ-NEXT: shlq $8, %r10 +; AVX512DQ-NEXT: orq %r13, %r10 +; AVX512DQ-NEXT: movzbl %r12b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $9, %r13 +; AVX512DQ-NEXT: orq %r10, %r13 +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $10, %r10 +; AVX512DQ-NEXT: orq %r13, %r10 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQ-NEXT: shlq $11, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: movzbl %r13b, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %r13 +; AVX512DQ-NEXT: shlq $12, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $13, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQ-NEXT: shlq $14, %r10 +; AVX512DQ-NEXT: orq %r12, %r10 +; AVX512DQ-NEXT: movzbl %r13b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $15, %r13 +; AVX512DQ-NEXT: orq %r10, %r13 +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $16, %r10 +; AVX512DQ-NEXT: orq %r13, %r10 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQ-NEXT: shlq $17, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: movzbl %r13b, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %r13 +; AVX512DQ-NEXT: shlq $18, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $19, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQ-NEXT: shlq $20, %r10 +; AVX512DQ-NEXT: orq %r12, %r10 +; AVX512DQ-NEXT: movzbl %r13b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $21, %r13 +; AVX512DQ-NEXT: orq %r10, %r13 +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $22, %r10 +; AVX512DQ-NEXT: orq %r13, %r10 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQ-NEXT: shlq $23, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: movzbl %r13b, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %r13 +; AVX512DQ-NEXT: shlq $24, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $25, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k4 +; AVX512DQ-NEXT: shlq $26, %r10 +; AVX512DQ-NEXT: orq %r12, %r10 +; AVX512DQ-NEXT: movzbl %r13b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $27, %r13 +; AVX512DQ-NEXT: orq %r10, %r13 +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $28, %r10 +; AVX512DQ-NEXT: orq %r13, %r10 +; AVX512DQ-NEXT: shlq $29, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: movzbl %r15b, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $30, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k4, %r10d +; AVX512DQ-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQ-NEXT: shlq $31, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: shlq $32, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $33, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $34, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $36, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $37, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQ-NEXT: shlq $38, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $39, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $40, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $15, %k3, %k3 +; AVX512DQ-NEXT: shlq $41, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $42, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $43, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: kmovw %k3, %r12d +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k3 +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $45, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: shlq $47, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $48, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $49, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: kmovw %k3, %r15d +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k3 +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k3, %r12d +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k3 +; AVX512DQ-NEXT: shlq $53, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $55, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k3 +; AVX512DQ-NEXT: shlq $56, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $57, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $58, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: kmovw %k3, %r13d +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k3 +; AVX512DQ-NEXT: shlq $59, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $60, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $61, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: kmovw %k3, %r15d +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k3 +; AVX512DQ-NEXT: shlq $62, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movzbl %r15b, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $63, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k3, %r12d +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k3 +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %rax, (%rsi) +; AVX512DQ-NEXT: movzbl %r12b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movzbl %r10b, %ebx +; AVX512DQ-NEXT: movl %ebx, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: leaq (%r12,%rax,2), %r12 +; AVX512DQ-NEXT: leaq (%r12,%rax,4), %r12 +; AVX512DQ-NEXT: leaq (%r12,%rax,8), %rax +; AVX512DQ-NEXT: movzbl %r14b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $4, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $5, %r14 +; AVX512DQ-NEXT: orq %r13, %r14 +; AVX512DQ-NEXT: shlq $6, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $7, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $8, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k3 +; AVX512DQ-NEXT: shlq $9, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $10, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $11, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $1, %k2, %k3 +; AVX512DQ-NEXT: shlq $12, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $13, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $14, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: shlq $15, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $16, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $17, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k3 +; AVX512DQ-NEXT: shlq $18, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $19, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $20, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQ-NEXT: shlq $21, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $22, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQ-NEXT: shlq $24, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $25, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $26, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQ-NEXT: shlq $27, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $28, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $29, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k3, %r12d +; AVX512DQ-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQ-NEXT: shlq $30, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movzbl %r12b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $32, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQ-NEXT: shlq $33, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $34, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $35, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k3, %r13d +; AVX512DQ-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $37, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k3, %r12d +; AVX512DQ-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQ-NEXT: shlq $39, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $41, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $10, %k2, %k3 +; AVX512DQ-NEXT: shlq $42, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $43, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $44, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k3, %r13d +; AVX512DQ-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQ-NEXT: shlq $45, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $46, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k3, %r12d +; AVX512DQ-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQ-NEXT: shlq $48, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $49, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $50, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $52, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $53, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k3, %r13d +; AVX512DQ-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $55, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k2 +; AVX512DQ-NEXT: shlq $57, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $58, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $59, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: shlq $60, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movl %r11d, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $61, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQ-NEXT: shlq $62, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 +; AVX512DQ-NEXT: movzbl %r11b, %r11d +; AVX512DQ-NEXT: shlq $63, %r11 +; AVX512DQ-NEXT: orq %rax, %r11 +; AVX512DQ-NEXT: kmovw %k2, %r13d +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k2 +; AVX512DQ-NEXT: orq %r14, %r11 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k1 +; AVX512DQ-NEXT: movq %r11, 16(%rsi) +; AVX512DQ-NEXT: kmovw %k1, %r11d +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k1 +; AVX512DQ-NEXT: movzbl %r11b, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: leaq (%r15,%r15,2), %r11 +; AVX512DQ-NEXT: leaq (%r11,%r14,4), %r11 +; AVX512DQ-NEXT: leaq (%r11,%r14,8), %r11 +; AVX512DQ-NEXT: shlq $4, %r14 +; AVX512DQ-NEXT: orq %r11, %r14 +; AVX512DQ-NEXT: movzbl %al, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r11 +; AVX512DQ-NEXT: shlq $5, %r11 +; AVX512DQ-NEXT: orq %r14, %r11 +; AVX512DQ-NEXT: movq %rax, %r14 +; AVX512DQ-NEXT: shlq $6, %r14 +; AVX512DQ-NEXT: shlq $7, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movzbl %r13b, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %r13 +; AVX512DQ-NEXT: shlq $8, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r14, %r15 +; AVX512DQ-NEXT: shlq $9, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: kmovw %k1, %r13d +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512DQ-NEXT: shlq $10, %r14 +; AVX512DQ-NEXT: orq %r15, %r14 +; AVX512DQ-NEXT: movzbl %r12b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $11, %r15 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $12, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: kmovw %k1, %r14d +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k1 +; AVX512DQ-NEXT: shlq $13, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $14, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $15, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: kmovw %k1, %r15d +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512DQ-NEXT: shlq $16, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %r9b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $17, %r9 +; AVX512DQ-NEXT: orq %r12, %r9 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $18, %r12 +; AVX512DQ-NEXT: orq %r9, %r12 +; AVX512DQ-NEXT: kmovw %k1, %r9d +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k1 +; AVX512DQ-NEXT: shlq $19, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %r8b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r8 +; AVX512DQ-NEXT: shlq $20, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $21, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: kmovw %k1, %r8d +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512DQ-NEXT: shlq $22, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %dil, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $23, %rdi +; AVX512DQ-NEXT: orq %r12, %rdi +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $24, %r12 +; AVX512DQ-NEXT: orq %rdi, %r12 +; AVX512DQ-NEXT: kmovw %k1, %edi +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512DQ-NEXT: shlq $25, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %dl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rdx +; AVX512DQ-NEXT: shlq $26, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $27, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: shlq $28, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %cl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $29, %rcx +; AVX512DQ-NEXT: orq %r12, %rcx +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $30, %r12 +; AVX512DQ-NEXT: orq %rcx, %r12 +; AVX512DQ-NEXT: kmovw %k0, %ebp +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movq %rbp, %rcx +; AVX512DQ-NEXT: shlq $32, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: shlq $34, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %rbp, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $36, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $37, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $39, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $40, %r14 +; AVX512DQ-NEXT: orq %r12, %r14 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $41, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r15, %r14 +; AVX512DQ-NEXT: shlq $42, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: shlq $43, %r15 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r9, %r14 +; AVX512DQ-NEXT: shlq $45, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: shlq $46, %r9 +; AVX512DQ-NEXT: orq %r14, %r9 +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $48, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: shlq $49, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $51, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: shlq $52, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $53, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $54, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $56, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: movq %rdx, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: shlq $58, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $59, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $60, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: shlq $61, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: shlq $62, %r10 +; AVX512DQ-NEXT: orq %rcx, %r10 +; AVX512DQ-NEXT: shlq $63, %rbx +; AVX512DQ-NEXT: orq %r10, %rbx +; AVX512DQ-NEXT: orq %r11, %rbx +; AVX512DQ-NEXT: movq %rbx, 8(%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor3_vf64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: kshiftrq $31, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrq $30, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrq $29, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrq $28, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrq $26, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrq $63, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrq $45, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $44, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $10, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrq $2, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrq $1, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrq $3, %k0, %k1 +; AVX512BW-NEXT: kmovd %k0, %r12d +; AVX512BW-NEXT: movzbl %r12b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: leaq (%r12,%r12,2), %r13 +; AVX512BW-NEXT: leaq (%r13,%r12,4), %r12 +; AVX512BW-NEXT: movzbl %bl, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: leaq (%r12,%rbx,8), %r12 +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $4, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: shlq $5, %rbx +; AVX512BW-NEXT: orq %r13, %rbx +; AVX512BW-NEXT: movzbl %r10b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $6, %r12 +; AVX512BW-NEXT: movq %r10, %r13 +; AVX512BW-NEXT: shlq $7, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $4, %k0, %k1 +; AVX512BW-NEXT: shlq $8, %r10 +; AVX512BW-NEXT: orq %r13, %r10 +; AVX512BW-NEXT: movzbl %r12b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $9, %r13 +; AVX512BW-NEXT: orq %r10, %r13 +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $10, %r10 +; AVX512BW-NEXT: orq %r13, %r10 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $5, %k0, %k1 +; AVX512BW-NEXT: shlq $11, %r12 +; AVX512BW-NEXT: orq %r10, %r12 +; AVX512BW-NEXT: movzbl %r13b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %r13 +; AVX512BW-NEXT: shlq $12, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $13, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: shlq $14, %r10 +; AVX512BW-NEXT: orq %r12, %r10 +; AVX512BW-NEXT: movzbl %r13b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $15, %r13 +; AVX512BW-NEXT: orq %r10, %r13 +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $16, %r10 +; AVX512BW-NEXT: orq %r13, %r10 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $7, %k0, %k1 +; AVX512BW-NEXT: shlq $17, %r12 +; AVX512BW-NEXT: orq %r10, %r12 +; AVX512BW-NEXT: movzbl %r13b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %r13 +; AVX512BW-NEXT: shlq $18, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $19, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $8, %k0, %k1 +; AVX512BW-NEXT: shlq $20, %r10 +; AVX512BW-NEXT: orq %r12, %r10 +; AVX512BW-NEXT: movzbl %r13b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $21, %r13 +; AVX512BW-NEXT: orq %r10, %r13 +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $22, %r10 +; AVX512BW-NEXT: orq %r13, %r10 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $9, %k0, %k1 +; AVX512BW-NEXT: shlq $23, %r12 +; AVX512BW-NEXT: orq %r10, %r12 +; AVX512BW-NEXT: movzbl %r13b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %r13 +; AVX512BW-NEXT: shlq $24, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $25, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $42, %k0, %k1 +; AVX512BW-NEXT: shlq $26, %r10 +; AVX512BW-NEXT: orq %r12, %r10 +; AVX512BW-NEXT: movzbl %r13b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $27, %r13 +; AVX512BW-NEXT: orq %r10, %r13 +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $28, %r10 +; AVX512BW-NEXT: orq %r13, %r10 +; AVX512BW-NEXT: shlq $29, %r12 +; AVX512BW-NEXT: orq %r10, %r12 +; AVX512BW-NEXT: movzbl %r14b, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrq $11, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $12, %k0, %k1 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: shlq $32, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $33, %r12 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: shlq $34, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $13, %k0, %k1 +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $36, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $37, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrq $14, %k0, %k1 +; AVX512BW-NEXT: shlq $38, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $39, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $40, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $15, %k0, %k1 +; AVX512BW-NEXT: shlq $41, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $42, %r12 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: shlq $43, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $16, %k0, %k1 +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $45, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrq $17, %k0, %k1 +; AVX512BW-NEXT: shlq $47, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $49, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $18, %k0, %k1 +; AVX512BW-NEXT: shlq $50, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $51, %r12 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: shlq $52, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $19, %k0, %k1 +; AVX512BW-NEXT: shlq $53, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $54, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $55, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrq $20, %k0, %k1 +; AVX512BW-NEXT: shlq $56, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $57, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $58, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $21, %k0, %k1 +; AVX512BW-NEXT: shlq $59, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $60, %r12 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: shlq $61, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 +; AVX512BW-NEXT: shlq $62, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movzbl %r12b, %r14d +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $63, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $46, %k0, %k1 +; AVX512BW-NEXT: orq %rbx, %r12 +; AVX512BW-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: movq %r12, (%rsi) +; AVX512BW-NEXT: movzbl %al, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movzbl %r10b, %ebx +; AVX512BW-NEXT: movl %ebx, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: leaq (%r12,%rax,2), %r12 +; AVX512BW-NEXT: leaq (%r12,%rax,4), %r12 +; AVX512BW-NEXT: leaq (%r12,%rax,8), %rax +; AVX512BW-NEXT: movzbl %r15b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $4, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $5, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: shlq $6, %r12 +; AVX512BW-NEXT: movzbl %bpl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $7, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $8, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $47, %k0, %k1 +; AVX512BW-NEXT: shlq $9, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $10, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $11, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 +; AVX512BW-NEXT: shlq $12, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %bpl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $13, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $14, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $49, %k0, %k1 +; AVX512BW-NEXT: shlq $15, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $16, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $17, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $50, %k0, %k1 +; AVX512BW-NEXT: shlq $18, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %bpl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $19, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $20, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $51, %k0, %k1 +; AVX512BW-NEXT: shlq $21, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $22, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $23, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $52, %k0, %k1 +; AVX512BW-NEXT: shlq $24, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %bpl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $25, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $26, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 +; AVX512BW-NEXT: shlq $27, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $28, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $29, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $54, %k0, %k1 +; AVX512BW-NEXT: shlq $30, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %r12b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $32, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $55, %k0, %k1 +; AVX512BW-NEXT: shlq $33, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $34, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $35, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $37, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $57, %k0, %k1 +; AVX512BW-NEXT: shlq $39, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $41, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 +; AVX512BW-NEXT: shlq $42, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $43, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $44, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 +; AVX512BW-NEXT: shlq $45, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $46, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $47, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 +; AVX512BW-NEXT: shlq $48, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $49, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $50, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $61, %k0, %k1 +; AVX512BW-NEXT: shlq $51, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $52, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $53, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $62, %k0, %k1 +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $55, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $25, %k0, %k1 +; AVX512BW-NEXT: shlq $57, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $59, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: shlq $60, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movl %r11d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $24, %k0, %k1 +; AVX512BW-NEXT: shlq $62, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $23, %k0, %k1 +; AVX512BW-NEXT: movzbl %r11b, %r13d +; AVX512BW-NEXT: shlq $63, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrq $22, %k0, %k1 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $32, %k0, %k1 +; AVX512BW-NEXT: movq %r13, 16(%rsi) +; AVX512BW-NEXT: kmovd %k1, %esi +; AVX512BW-NEXT: kshiftrq $33, %k0, %k1 +; AVX512BW-NEXT: movzbl %r15b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: leaq (%r14,%r14,2), %r14 +; AVX512BW-NEXT: leaq (%r14,%rax,4), %r14 +; AVX512BW-NEXT: leaq (%r14,%rax,8), %r14 +; AVX512BW-NEXT: shlq $4, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movzbl %r11b, %r14d +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %r11 +; AVX512BW-NEXT: shlq $5, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: shlq $7, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movzbl %r12b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: shlq $8, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrq $34, %k0, %k1 +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $11, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $35, %k0, %k1 +; AVX512BW-NEXT: shlq $13, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %r9b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $14, %r9 +; AVX512BW-NEXT: orq %r12, %r9 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $15, %r12 +; AVX512BW-NEXT: orq %r9, %r12 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrq $36, %k0, %k1 +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %r8b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r8 +; AVX512BW-NEXT: shlq $17, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrq $37, %k0, %k1 +; AVX512BW-NEXT: shlq $19, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %dil, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $20, %rdi +; AVX512BW-NEXT: orq %r12, %rdi +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $21, %r12 +; AVX512BW-NEXT: orq %rdi, %r12 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrq $38, %k0, %k1 +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %dl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rdx +; AVX512BW-NEXT: shlq $23, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrq $39, %k0, %k1 +; AVX512BW-NEXT: shlq $25, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %cl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $26, %rcx +; AVX512BW-NEXT: orq %r12, %rcx +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $27, %r12 +; AVX512BW-NEXT: orq %rcx, %r12 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrq $40, %k0, %k1 +; AVX512BW-NEXT: shlq $28, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rbp +; AVX512BW-NEXT: shlq $29, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $41, %k0, %k0 +; AVX512BW-NEXT: shlq $31, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %esi +; AVX512BW-NEXT: movq %rsi, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rsi, %r13 +; AVX512BW-NEXT: shlq $33, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k0, %r12d +; AVX512BW-NEXT: shlq $34, %rsi +; AVX512BW-NEXT: orq %r13, %rsi +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %rsi, %rax +; AVX512BW-NEXT: movq %r14, %rsi +; AVX512BW-NEXT: shlq $36, %rsi +; AVX512BW-NEXT: orq %rax, %rsi +; AVX512BW-NEXT: shlq $37, %r14 +; AVX512BW-NEXT: orq %rsi, %r14 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r15, %rsi +; AVX512BW-NEXT: shlq $39, %rsi +; AVX512BW-NEXT: orq %rax, %rsi +; AVX512BW-NEXT: shlq $40, %r15 +; AVX512BW-NEXT: orq %rsi, %r15 +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $41, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r9, %rsi +; AVX512BW-NEXT: shlq $42, %rsi +; AVX512BW-NEXT: orq %rax, %rsi +; AVX512BW-NEXT: shlq $43, %r9 +; AVX512BW-NEXT: orq %rsi, %r9 +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r8, %rsi +; AVX512BW-NEXT: shlq $45, %rsi +; AVX512BW-NEXT: orq %rax, %rsi +; AVX512BW-NEXT: shlq $46, %r8 +; AVX512BW-NEXT: orq %rsi, %r8 +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $47, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdi, %rsi +; AVX512BW-NEXT: shlq $48, %rsi +; AVX512BW-NEXT: orq %rax, %rsi +; AVX512BW-NEXT: shlq $49, %rdi +; AVX512BW-NEXT: orq %rsi, %rdi +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movq %rdx, %rsi +; AVX512BW-NEXT: shlq $51, %rsi +; AVX512BW-NEXT: orq %rax, %rsi +; AVX512BW-NEXT: shlq $52, %rdx +; AVX512BW-NEXT: orq %rsi, %rdx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $53, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $54, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $55, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %rbp, %rcx +; AVX512BW-NEXT: shlq $57, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $58, %rbp +; AVX512BW-NEXT: orq %rcx, %rbp +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $59, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $60, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $61, %r12 +; AVX512BW-NEXT: orq %rcx, %r12 +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: shlq $62, %r10 +; AVX512BW-NEXT: orq %r12, %r10 +; AVX512BW-NEXT: shlq $63, %rbx +; AVX512BW-NEXT: orq %r10, %rbx +; AVX512BW-NEXT: orq %r11, %rbx +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: movq %rbx, 8(%rax) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <64 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <192 x i32> + store <192 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor4_vf2(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-SLOW-LABEL: mask_replication_factor4_vf2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 +; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512F-SLOW-NEXT: kmovw %k0, %eax +; AVX512F-SLOW-NEXT: movb %al, (%rsi) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: mask_replication_factor4_vf2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 +; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512F-FAST-NEXT: kmovw %k0, %eax +; AVX512F-FAST-NEXT: movb %al, (%rsi) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: mask_replication_factor4_vf2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k0 +; AVX512DQ-SLOW-NEXT: kmovb %k0, (%rsi) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: mask_replication_factor4_vf2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k0 +; AVX512DQ-FAST-NEXT: kmovb %k0, (%rsi) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: mask_replication_factor4_vf2: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512BW-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512BW-SLOW-NEXT: kmovd %k0, %eax +; AVX512BW-SLOW-NEXT: movb %al, (%rsi) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: mask_replication_factor4_vf2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: kmovw (%rdi), %k1 +; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512BW-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512BW-FAST-NEXT: kmovd %k0, %eax +; AVX512BW-FAST-NEXT: movb %al, (%rsi) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq +; +; AVX512VBMI-SLOW-LABEL: mask_replication_factor4_vf2: +; AVX512VBMI-SLOW: # %bb.0: +; AVX512VBMI-SLOW-NEXT: kmovw (%rdi), %k1 +; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VBMI-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512VBMI-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512VBMI-SLOW-NEXT: kmovd %k0, %eax +; AVX512VBMI-SLOW-NEXT: movb %al, (%rsi) +; AVX512VBMI-SLOW-NEXT: vzeroupper +; AVX512VBMI-SLOW-NEXT: retq +; +; AVX512VBMI-FAST-LABEL: mask_replication_factor4_vf2: +; AVX512VBMI-FAST: # %bb.0: +; AVX512VBMI-FAST-NEXT: kmovw (%rdi), %k1 +; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VBMI-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VBMI-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512VBMI-FAST-NEXT: kmovd %k0, %eax +; AVX512VBMI-FAST-NEXT: movb %al, (%rsi) +; AVX512VBMI-FAST-NEXT: vzeroupper +; AVX512VBMI-FAST-NEXT: retq + %src.vec = load <2 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <8 x i32> + store <8 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor4_vf4(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor4_vf4: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor4_vf4: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor4_vf4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovw %k0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %src.vec = load <4 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <16 x i32> + store <16 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor4_vf8(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor4_vf8: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, 2(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor4_vf8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: kmovw %k1, 2(%rsi) +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor4_vf8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %src.vec = load <8 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <32 x i32> + store <32 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor4_vf16(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor4_vf16: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 +; AVX512F-ONLY-NEXT: kmovw %k3, 4(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k2, 6(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k1, 2(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor4_vf16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 +; AVX512DQ-NEXT: kmovw %k3, 4(%rsi) +; AVX512DQ-NEXT: kmovw %k2, 6(%rsi) +; AVX512DQ-NEXT: kmovw %k1, 2(%rsi) +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k0 +; AVX512BW-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k0 +; AVX512VBMI-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq + %src.vec = load <16 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <64 x i32> + store <64 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor4_vf32(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor4_vf32: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k3 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 +; AVX512F-ONLY-NEXT: kmovw %k7, 12(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k6, 14(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k5, 8(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k3, 10(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k4, 4(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k2, 6(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k1, (%rsi) +; AVX512F-ONLY-NEXT: kmovw %k0, 2(%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor4_vf32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 +; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 +; AVX512DQ-NEXT: kmovw %k7, 12(%rsi) +; AVX512DQ-NEXT: kmovw %k6, 14(%rsi) +; AVX512DQ-NEXT: kmovw %k5, 8(%rsi) +; AVX512DQ-NEXT: kmovw %k3, 10(%rsi) +; AVX512DQ-NEXT: kmovw %k4, 4(%rsi) +; AVX512DQ-NEXT: kmovw %k2, 6(%rsi) +; AVX512DQ-NEXT: kmovw %k1, (%rsi) +; AVX512DQ-NEXT: kmovw %k0, 2(%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf32: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: kmovq %k1, 8(%rsi) +; AVX512BW-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf32: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: kmovq %k1, 8(%rsi) +; AVX512VBMI-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq + %src.vec = load <32 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <128 x i32> + store <128 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor4_vf64(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor4_vf64: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 28(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 30(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 24(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 26(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 20(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 22(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 16(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 18(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 12(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 14(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 8(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 10(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 4(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 6(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 2(%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor4_vf64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpmovd2m %zmm9, %k0 +; AVX512DQ-NEXT: kmovw %k0, 28(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k0 +; AVX512DQ-NEXT: kmovw %k0, 30(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k0 +; AVX512DQ-NEXT: kmovw %k0, 24(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512DQ-NEXT: kmovw %k0, 26(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512DQ-NEXT: kmovw %k0, 20(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm15, %k0 +; AVX512DQ-NEXT: kmovw %k0, 22(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm14, %k0 +; AVX512DQ-NEXT: kmovw %k0, 16(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k0 +; AVX512DQ-NEXT: kmovw %k0, 18(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, 12(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm12, %k0 +; AVX512DQ-NEXT: kmovw %k0, 14(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm11, %k0 +; AVX512DQ-NEXT: kmovw %k0, 8(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm10, %k0 +; AVX512DQ-NEXT: kmovw %k0, 10(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, 4(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k0 +; AVX512DQ-NEXT: kmovw %k0, 6(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm6, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k0 +; AVX512DQ-NEXT: kmovw %k0, 2(%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf64: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5,4,5,4,5] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512BW-ONLY-NEXT: kmovq %k3, 16(%rsi) +; AVX512BW-ONLY-NEXT: kmovq %k2, 24(%rsi) +; AVX512BW-ONLY-NEXT: kmovq %k1, 8(%rsi) +; AVX512BW-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf64: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [48,48,48,48,49,49,49,49,50,50,50,50,51,51,51,51,52,52,52,52,53,53,53,53,54,54,54,54,55,55,55,55,56,56,56,56,57,57,57,57,58,58,58,58,59,59,59,59,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,33,33,33,33,34,34,34,34,35,35,35,35,36,36,36,36,37,37,37,37,38,38,38,38,39,39,39,39,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,44,44,44,44,45,45,45,45,46,46,46,46,47,47,47,47] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512VBMI-ONLY-NEXT: kmovq %k3, 16(%rsi) +; AVX512VBMI-ONLY-NEXT: kmovq %k2, 24(%rsi) +; AVX512VBMI-ONLY-NEXT: kmovq %k1, 8(%rsi) +; AVX512VBMI-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq + %src.vec = load <64 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <256 x i32> + store <256 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor5_vf2(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor5_vf2: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k0, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rax,2), %ecx +; AVX512F-ONLY-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512F-ONLY-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512F-ONLY-NEXT: shll $4, %eax +; AVX512F-ONLY-NEXT: orl %ecx, %eax +; AVX512F-ONLY-NEXT: kmovw %k1, %ecx +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $5, %edi +; AVX512F-ONLY-NEXT: orl %eax, %edi +; AVX512F-ONLY-NEXT: movl %edx, %eax +; AVX512F-ONLY-NEXT: shll $6, %eax +; AVX512F-ONLY-NEXT: movl %edx, %r8d +; AVX512F-ONLY-NEXT: shll $7, %r8d +; AVX512F-ONLY-NEXT: orl %eax, %r8d +; AVX512F-ONLY-NEXT: shll $8, %edx +; AVX512F-ONLY-NEXT: orl %r8d, %edx +; AVX512F-ONLY-NEXT: shll $9, %ecx +; AVX512F-ONLY-NEXT: orl %edx, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: andl $1023, %ecx # imm = 0x3FF +; AVX512F-ONLY-NEXT: movw %cx, (%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor5_vf2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: kshiftrb $1, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leal (%rax,%rax,2), %ecx +; AVX512DQ-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512DQ-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512DQ-NEXT: shll $4, %eax +; AVX512DQ-NEXT: orl %ecx, %eax +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $5, %edi +; AVX512DQ-NEXT: orl %eax, %edi +; AVX512DQ-NEXT: movl %edx, %eax +; AVX512DQ-NEXT: shll $6, %eax +; AVX512DQ-NEXT: movl %edx, %r8d +; AVX512DQ-NEXT: shll $7, %r8d +; AVX512DQ-NEXT: orl %eax, %r8d +; AVX512DQ-NEXT: shll $8, %edx +; AVX512DQ-NEXT: orl %r8d, %edx +; AVX512DQ-NEXT: shll $9, %ecx +; AVX512DQ-NEXT: orl %edx, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: andl $1023, %ecx # imm = 0x3FF +; AVX512DQ-NEXT: movw %cx, (%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor5_vf2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k1 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leal (%rax,%rax,2), %ecx +; AVX512BW-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512BW-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512BW-NEXT: shll $4, %eax +; AVX512BW-NEXT: orl %ecx, %eax +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $5, %edi +; AVX512BW-NEXT: orl %eax, %edi +; AVX512BW-NEXT: movl %edx, %eax +; AVX512BW-NEXT: shll $6, %eax +; AVX512BW-NEXT: movl %edx, %r8d +; AVX512BW-NEXT: shll $7, %r8d +; AVX512BW-NEXT: orl %eax, %r8d +; AVX512BW-NEXT: shll $8, %edx +; AVX512BW-NEXT: orl %r8d, %edx +; AVX512BW-NEXT: shll $9, %ecx +; AVX512BW-NEXT: orl %edx, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: andl $1023, %ecx # imm = 0x3FF +; AVX512BW-NEXT: movw %cx, (%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <2 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <10 x i32> + store <10 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor5_vf4(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor5_vf4: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2 +; AVX512F-ONLY-NEXT: kshiftrw $3, %k2, %k0 +; AVX512F-ONLY-NEXT: kshiftrw $2, %k2, %k1 +; AVX512F-ONLY-NEXT: kshiftrw $1, %k2, %k3 +; AVX512F-ONLY-NEXT: kmovw %k2, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: leal (%rcx,%rcx,2), %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rcx,4), %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rcx,8), %eax +; AVX512F-ONLY-NEXT: shll $4, %ecx +; AVX512F-ONLY-NEXT: orl %eax, %ecx +; AVX512F-ONLY-NEXT: kmovw %k3, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movl %edx, %eax +; AVX512F-ONLY-NEXT: shll $5, %eax +; AVX512F-ONLY-NEXT: orl %ecx, %eax +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $6, %ecx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $7, %edi +; AVX512F-ONLY-NEXT: orl %ecx, %edi +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $8, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: shll $9, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: kmovw %k1, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movl %edi, %ecx +; AVX512F-ONLY-NEXT: shll $10, %ecx +; AVX512F-ONLY-NEXT: orl %edx, %ecx +; AVX512F-ONLY-NEXT: movl %edi, %edx +; AVX512F-ONLY-NEXT: shll $11, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: movl %edi, %ecx +; AVX512F-ONLY-NEXT: shll $12, %ecx +; AVX512F-ONLY-NEXT: orl %edx, %ecx +; AVX512F-ONLY-NEXT: movl %edi, %edx +; AVX512F-ONLY-NEXT: shll $13, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $14, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: kmovw %k0, %ecx +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movl %edx, %r8d +; AVX512F-ONLY-NEXT: shll $15, %r8d +; AVX512F-ONLY-NEXT: orl %edi, %r8d +; AVX512F-ONLY-NEXT: orl %eax, %r8d +; AVX512F-ONLY-NEXT: movw %r8w, (%rsi) +; AVX512F-ONLY-NEXT: movl %edx, %eax +; AVX512F-ONLY-NEXT: shll $16, %eax +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $17, %edi +; AVX512F-ONLY-NEXT: orl %eax, %edi +; AVX512F-ONLY-NEXT: shll $18, %edx +; AVX512F-ONLY-NEXT: orl %edi, %edx +; AVX512F-ONLY-NEXT: shll $19, %ecx +; AVX512F-ONLY-NEXT: orl %edx, %ecx +; AVX512F-ONLY-NEXT: orl %r8d, %ecx +; AVX512F-ONLY-NEXT: shrl $16, %ecx +; AVX512F-ONLY-NEXT: andl $15, %ecx +; AVX512F-ONLY-NEXT: movb %cl, 2(%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor5_vf4: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k2 +; AVX512DQ-NEXT: kshiftrb $3, %k2, %k0 +; AVX512DQ-NEXT: kshiftrb $2, %k2, %k1 +; AVX512DQ-NEXT: kshiftrb $1, %k2, %k3 +; AVX512DQ-NEXT: kmovw %k2, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: leal (%rcx,%rcx,2), %eax +; AVX512DQ-NEXT: leal (%rax,%rcx,4), %eax +; AVX512DQ-NEXT: leal (%rax,%rcx,8), %eax +; AVX512DQ-NEXT: shll $4, %ecx +; AVX512DQ-NEXT: orl %eax, %ecx +; AVX512DQ-NEXT: kmovw %k3, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movl %edx, %eax +; AVX512DQ-NEXT: shll $5, %eax +; AVX512DQ-NEXT: orl %ecx, %eax +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $6, %ecx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $7, %edi +; AVX512DQ-NEXT: orl %ecx, %edi +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $8, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: shll $9, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: kmovw %k1, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movl %edi, %ecx +; AVX512DQ-NEXT: shll $10, %ecx +; AVX512DQ-NEXT: orl %edx, %ecx +; AVX512DQ-NEXT: movl %edi, %edx +; AVX512DQ-NEXT: shll $11, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: movl %edi, %ecx +; AVX512DQ-NEXT: shll $12, %ecx +; AVX512DQ-NEXT: orl %edx, %ecx +; AVX512DQ-NEXT: movl %edi, %edx +; AVX512DQ-NEXT: shll $13, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: shll $14, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movl %edx, %r8d +; AVX512DQ-NEXT: shll $15, %r8d +; AVX512DQ-NEXT: orl %edi, %r8d +; AVX512DQ-NEXT: orl %eax, %r8d +; AVX512DQ-NEXT: movw %r8w, (%rsi) +; AVX512DQ-NEXT: movl %edx, %eax +; AVX512DQ-NEXT: shll $16, %eax +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $17, %edi +; AVX512DQ-NEXT: orl %eax, %edi +; AVX512DQ-NEXT: shll $18, %edx +; AVX512DQ-NEXT: orl %edi, %edx +; AVX512DQ-NEXT: shll $19, %ecx +; AVX512DQ-NEXT: orl %edx, %ecx +; AVX512DQ-NEXT: orl %r8d, %ecx +; AVX512DQ-NEXT: shrl $16, %ecx +; AVX512DQ-NEXT: andl $15, %ecx +; AVX512DQ-NEXT: movb %cl, 2(%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor5_vf4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k2 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k3 +; AVX512BW-NEXT: kmovd %k2, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: leal (%rcx,%rcx,2), %eax +; AVX512BW-NEXT: leal (%rax,%rcx,4), %eax +; AVX512BW-NEXT: leal (%rax,%rcx,8), %eax +; AVX512BW-NEXT: shll $4, %ecx +; AVX512BW-NEXT: orl %eax, %ecx +; AVX512BW-NEXT: kmovd %k3, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movl %edx, %eax +; AVX512BW-NEXT: shll $5, %eax +; AVX512BW-NEXT: orl %ecx, %eax +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $6, %ecx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $7, %edi +; AVX512BW-NEXT: orl %ecx, %edi +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $8, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: shll $9, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movl %edi, %ecx +; AVX512BW-NEXT: shll $10, %ecx +; AVX512BW-NEXT: orl %edx, %ecx +; AVX512BW-NEXT: movl %edi, %edx +; AVX512BW-NEXT: shll $11, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: movl %edi, %ecx +; AVX512BW-NEXT: shll $12, %ecx +; AVX512BW-NEXT: orl %edx, %ecx +; AVX512BW-NEXT: movl %edi, %edx +; AVX512BW-NEXT: shll $13, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: shll $14, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movl %edx, %r8d +; AVX512BW-NEXT: shll $15, %r8d +; AVX512BW-NEXT: orl %edi, %r8d +; AVX512BW-NEXT: orl %eax, %r8d +; AVX512BW-NEXT: movw %r8w, (%rsi) +; AVX512BW-NEXT: movl %edx, %eax +; AVX512BW-NEXT: shll $16, %eax +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $17, %edi +; AVX512BW-NEXT: orl %eax, %edi +; AVX512BW-NEXT: shll $18, %edx +; AVX512BW-NEXT: orl %edi, %edx +; AVX512BW-NEXT: shll $19, %ecx +; AVX512BW-NEXT: orl %edx, %ecx +; AVX512BW-NEXT: orl %r8d, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: andl $15, %ecx +; AVX512BW-NEXT: movb %cl, 2(%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <4 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <20 x i32> + store <20 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor5_vf8(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor5_vf8: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %edi +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %edx +; AVX512F-ONLY-NEXT: kmovw %k0, %r11d +; AVX512F-ONLY-NEXT: movzbl %r11b, %ebx +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: leaq (%rbx,%rbx,2), %r11 +; AVX512F-ONLY-NEXT: leaq (%r11,%rbx,4), %r11 +; AVX512F-ONLY-NEXT: leaq (%r11,%rbx,8), %r11 +; AVX512F-ONLY-NEXT: shlq $4, %rbx +; AVX512F-ONLY-NEXT: orq %r11, %rbx +; AVX512F-ONLY-NEXT: movzbl %dl, %r11d +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: movq %r11, %rdx +; AVX512F-ONLY-NEXT: shlq $5, %rdx +; AVX512F-ONLY-NEXT: orq %rbx, %rdx +; AVX512F-ONLY-NEXT: movq %r11, %rbx +; AVX512F-ONLY-NEXT: shlq $6, %rbx +; AVX512F-ONLY-NEXT: movq %r11, %r14 +; AVX512F-ONLY-NEXT: shlq $7, %r14 +; AVX512F-ONLY-NEXT: orq %rbx, %r14 +; AVX512F-ONLY-NEXT: movq %r11, %rbx +; AVX512F-ONLY-NEXT: shlq $8, %rbx +; AVX512F-ONLY-NEXT: orq %r14, %rbx +; AVX512F-ONLY-NEXT: shlq $9, %r11 +; AVX512F-ONLY-NEXT: orq %rbx, %r11 +; AVX512F-ONLY-NEXT: movzbl %r10b, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rbx +; AVX512F-ONLY-NEXT: shlq $10, %rbx +; AVX512F-ONLY-NEXT: orq %r11, %rbx +; AVX512F-ONLY-NEXT: movq %r10, %r11 +; AVX512F-ONLY-NEXT: shlq $11, %r11 +; AVX512F-ONLY-NEXT: orq %rbx, %r11 +; AVX512F-ONLY-NEXT: movq %r10, %rbx +; AVX512F-ONLY-NEXT: shlq $12, %rbx +; AVX512F-ONLY-NEXT: orq %r11, %rbx +; AVX512F-ONLY-NEXT: movq %r10, %r11 +; AVX512F-ONLY-NEXT: shlq $13, %r11 +; AVX512F-ONLY-NEXT: orq %rbx, %r11 +; AVX512F-ONLY-NEXT: shlq $14, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: movzbl %r9b, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $15, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $16, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $17, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $18, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: shlq $19, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movzbl %r8b, %r8d +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %r10 +; AVX512F-ONLY-NEXT: shlq $20, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $21, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %r8, %r10 +; AVX512F-ONLY-NEXT: shlq $22, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $23, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: shlq $24, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: movzbl %dil, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %rdi +; AVX512F-ONLY-NEXT: shlq $25, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $26, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %r9, %rdi +; AVX512F-ONLY-NEXT: shlq $27, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $28, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $29, %r9 +; AVX512F-ONLY-NEXT: orq %r8, %r9 +; AVX512F-ONLY-NEXT: movzbl %cl, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $30, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $31, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movl %edi, (%rsi) +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $32, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $33, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $34, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: movzbl %al, %edx +; AVX512F-ONLY-NEXT: # kill: def $eax killed $eax def $rax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $35, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $36, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $37, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $39, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: shrq $32, %rdx +; AVX512F-ONLY-NEXT: movb %dl, 4(%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor5_vf8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: kshiftrb $7, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: kshiftrb $5, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %edi +; AVX512DQ-NEXT: kshiftrb $4, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r8d +; AVX512DQ-NEXT: kshiftrb $3, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r9d +; AVX512DQ-NEXT: kshiftrb $2, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r10d +; AVX512DQ-NEXT: kshiftrb $1, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: kmovw %k0, %r11d +; AVX512DQ-NEXT: movzbl %r11b, %ebx +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: leaq (%rbx,%rbx,2), %r11 +; AVX512DQ-NEXT: leaq (%r11,%rbx,4), %r11 +; AVX512DQ-NEXT: leaq (%r11,%rbx,8), %r11 +; AVX512DQ-NEXT: shlq $4, %rbx +; AVX512DQ-NEXT: orq %r11, %rbx +; AVX512DQ-NEXT: movzbl %dl, %r11d +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: movq %r11, %rdx +; AVX512DQ-NEXT: shlq $5, %rdx +; AVX512DQ-NEXT: orq %rbx, %rdx +; AVX512DQ-NEXT: movq %r11, %rbx +; AVX512DQ-NEXT: shlq $6, %rbx +; AVX512DQ-NEXT: movq %r11, %r14 +; AVX512DQ-NEXT: shlq $7, %r14 +; AVX512DQ-NEXT: orq %rbx, %r14 +; AVX512DQ-NEXT: movq %r11, %rbx +; AVX512DQ-NEXT: shlq $8, %rbx +; AVX512DQ-NEXT: orq %r14, %rbx +; AVX512DQ-NEXT: shlq $9, %r11 +; AVX512DQ-NEXT: orq %rbx, %r11 +; AVX512DQ-NEXT: movzbl %r10b, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rbx +; AVX512DQ-NEXT: shlq $10, %rbx +; AVX512DQ-NEXT: orq %r11, %rbx +; AVX512DQ-NEXT: movq %r10, %r11 +; AVX512DQ-NEXT: shlq $11, %r11 +; AVX512DQ-NEXT: orq %rbx, %r11 +; AVX512DQ-NEXT: movq %r10, %rbx +; AVX512DQ-NEXT: shlq $12, %rbx +; AVX512DQ-NEXT: orq %r11, %rbx +; AVX512DQ-NEXT: movq %r10, %r11 +; AVX512DQ-NEXT: shlq $13, %r11 +; AVX512DQ-NEXT: orq %rbx, %r11 +; AVX512DQ-NEXT: shlq $14, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: movzbl %r9b, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $15, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $16, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $17, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $18, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: shlq $19, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movzbl %r8b, %r8d +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %r10 +; AVX512DQ-NEXT: shlq $20, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $21, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %r8, %r10 +; AVX512DQ-NEXT: shlq $22, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $23, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: shlq $24, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: movzbl %dil, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %rdi +; AVX512DQ-NEXT: shlq $25, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %r9, %r8 +; AVX512DQ-NEXT: shlq $26, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %r9, %rdi +; AVX512DQ-NEXT: shlq $27, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %r9, %r8 +; AVX512DQ-NEXT: shlq $28, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: shlq $29, %r9 +; AVX512DQ-NEXT: orq %r8, %r9 +; AVX512DQ-NEXT: movzbl %cl, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $30, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: shlq $31, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movl %edi, (%rsi) +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $32, %rdx +; AVX512DQ-NEXT: movq %rcx, %r8 +; AVX512DQ-NEXT: shlq $33, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: shlq $34, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: movzbl %al, %edx +; AVX512DQ-NEXT: # kill: def $eax killed $eax def $rax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $35, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $36, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $37, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: shlq $39, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: shrq $32, %rdx +; AVX512DQ-NEXT: movb %dl, 4(%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor5_vf8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: kshiftrw $7, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrw $6, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrw $5, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrw $4, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrw $3, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrw $2, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrw $1, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kmovd %k0, %r11d +; AVX512BW-NEXT: movzbl %r11b, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: leaq (%rbx,%rbx,2), %r11 +; AVX512BW-NEXT: leaq (%r11,%rbx,4), %r11 +; AVX512BW-NEXT: leaq (%r11,%rbx,8), %r11 +; AVX512BW-NEXT: shlq $4, %rbx +; AVX512BW-NEXT: orq %r11, %rbx +; AVX512BW-NEXT: movzbl %dl, %r11d +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rdx +; AVX512BW-NEXT: shlq $5, %rdx +; AVX512BW-NEXT: orq %rbx, %rdx +; AVX512BW-NEXT: movq %r11, %rbx +; AVX512BW-NEXT: shlq $6, %rbx +; AVX512BW-NEXT: movq %r11, %r14 +; AVX512BW-NEXT: shlq $7, %r14 +; AVX512BW-NEXT: orq %rbx, %r14 +; AVX512BW-NEXT: movq %r11, %rbx +; AVX512BW-NEXT: shlq $8, %rbx +; AVX512BW-NEXT: orq %r14, %rbx +; AVX512BW-NEXT: shlq $9, %r11 +; AVX512BW-NEXT: orq %rbx, %r11 +; AVX512BW-NEXT: movzbl %r10b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %rbx +; AVX512BW-NEXT: shlq $10, %rbx +; AVX512BW-NEXT: orq %r11, %rbx +; AVX512BW-NEXT: movq %r10, %r11 +; AVX512BW-NEXT: shlq $11, %r11 +; AVX512BW-NEXT: orq %rbx, %r11 +; AVX512BW-NEXT: movq %r10, %rbx +; AVX512BW-NEXT: shlq $12, %rbx +; AVX512BW-NEXT: orq %r11, %rbx +; AVX512BW-NEXT: movq %r10, %r11 +; AVX512BW-NEXT: shlq $13, %r11 +; AVX512BW-NEXT: orq %rbx, %r11 +; AVX512BW-NEXT: shlq $14, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: movzbl %r9b, %r9d +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $15, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $16, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $17, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $18, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: shlq $19, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movzbl %r8b, %r8d +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %r10 +; AVX512BW-NEXT: shlq $20, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $21, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movq %r8, %r10 +; AVX512BW-NEXT: shlq $22, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $23, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: shlq $24, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: movzbl %dil, %r9d +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %rdi +; AVX512BW-NEXT: shlq $25, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: movq %r9, %r8 +; AVX512BW-NEXT: shlq $26, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %r9, %rdi +; AVX512BW-NEXT: shlq $27, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: movq %r9, %r8 +; AVX512BW-NEXT: shlq $28, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: shlq $29, %r9 +; AVX512BW-NEXT: orq %r8, %r9 +; AVX512BW-NEXT: movzbl %cl, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $30, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: shlq $31, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movl %edi, (%rsi) +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $32, %rdx +; AVX512BW-NEXT: movq %rcx, %r8 +; AVX512BW-NEXT: shlq $33, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: shlq $34, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: movzbl %al, %edx +; AVX512BW-NEXT: # kill: def $eax killed $eax def $rax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $35, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $36, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $37, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: shlq $39, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: shrq $32, %rdx +; AVX512BW-NEXT: movb %dl, 4(%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: retq + %src.vec = load <8 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <40 x i32> + store <40 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor5_vf16(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor5_vf16: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r13 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512F-ONLY-NEXT: kshiftrw $12, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %edx +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %edi +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $15, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: movl %edx, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: leal (%r14,%r11,2), %ebx +; AVX512F-ONLY-NEXT: leal (%rbx,%r11,4), %ebx +; AVX512F-ONLY-NEXT: leal (%rbx,%r11,8), %ebx +; AVX512F-ONLY-NEXT: movl %r11d, %r12d +; AVX512F-ONLY-NEXT: shll $4, %r12d +; AVX512F-ONLY-NEXT: orl %ebx, %r12d +; AVX512F-ONLY-NEXT: shll $5, %r11d +; AVX512F-ONLY-NEXT: orl %r12d, %r11d +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movl %ebp, %ebx +; AVX512F-ONLY-NEXT: shll $6, %ebx +; AVX512F-ONLY-NEXT: movl %ebp, %r12d +; AVX512F-ONLY-NEXT: shll $7, %r12d +; AVX512F-ONLY-NEXT: orl %ebx, %r12d +; AVX512F-ONLY-NEXT: movl %ebp, %ebx +; AVX512F-ONLY-NEXT: shll $8, %ebx +; AVX512F-ONLY-NEXT: orl %r12d, %ebx +; AVX512F-ONLY-NEXT: movl %ebp, %r12d +; AVX512F-ONLY-NEXT: shll $9, %r12d +; AVX512F-ONLY-NEXT: orl %ebx, %r12d +; AVX512F-ONLY-NEXT: shll $10, %ebp +; AVX512F-ONLY-NEXT: orl %r12d, %ebp +; AVX512F-ONLY-NEXT: movl %eax, %ebx +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movl %ebx, %r12d +; AVX512F-ONLY-NEXT: shll $11, %r12d +; AVX512F-ONLY-NEXT: orl %ebp, %r12d +; AVX512F-ONLY-NEXT: movl %ebx, %ebp +; AVX512F-ONLY-NEXT: shll $12, %ebp +; AVX512F-ONLY-NEXT: orl %r12d, %ebp +; AVX512F-ONLY-NEXT: movl %ebx, %r13d +; AVX512F-ONLY-NEXT: shll $13, %r13d +; AVX512F-ONLY-NEXT: orl %ebp, %r13d +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k0, %k1 +; AVX512F-ONLY-NEXT: shll $14, %ebx +; AVX512F-ONLY-NEXT: orl %r13d, %ebx +; AVX512F-ONLY-NEXT: kmovw %k1, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k0, %k1 +; AVX512F-ONLY-NEXT: shll $15, %eax +; AVX512F-ONLY-NEXT: orl %ebx, %eax +; AVX512F-ONLY-NEXT: orl %r11d, %eax +; AVX512F-ONLY-NEXT: movw %ax, 8(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k0, %eax +; AVX512F-ONLY-NEXT: movzbl %al, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %r11 +; AVX512F-ONLY-NEXT: leaq (%r11,%rax,4), %r11 +; AVX512F-ONLY-NEXT: leaq (%r11,%rax,8), %r11 +; AVX512F-ONLY-NEXT: shlq $4, %rax +; AVX512F-ONLY-NEXT: orq %r11, %rax +; AVX512F-ONLY-NEXT: movzbl %r10b, %ebp +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movq %rbp, %r10 +; AVX512F-ONLY-NEXT: shlq $5, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %r11 +; AVX512F-ONLY-NEXT: shlq $7, %r11 +; AVX512F-ONLY-NEXT: orq %rax, %r11 +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r11, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k0, %k0 +; AVX512F-ONLY-NEXT: shlq $9, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: movzbl %r9b, %ebx +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %rbp, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r9 +; AVX512F-ONLY-NEXT: shlq $11, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %rbp +; AVX512F-ONLY-NEXT: shlq $13, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k0, %r9d +; AVX512F-ONLY-NEXT: movzbl %dl, %edx +; AVX512F-ONLY-NEXT: shlq $14, %rbx +; AVX512F-ONLY-NEXT: orq %rbp, %rbx +; AVX512F-ONLY-NEXT: movzbl %r8b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $15, %r8 +; AVX512F-ONLY-NEXT: orq %rbx, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $16, %rbx +; AVX512F-ONLY-NEXT: orq %r8, %rbx +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $17, %r8 +; AVX512F-ONLY-NEXT: orq %rbx, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $18, %rbx +; AVX512F-ONLY-NEXT: orq %r8, %rbx +; AVX512F-ONLY-NEXT: shlq $19, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movzbl %dil, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $20, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $21, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $22, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $24, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $25, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $26, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $27, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $28, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: shlq $29, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movzbl %cl, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $30, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $31, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $33, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $34, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $36, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $37, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $38, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $39, %r15 +; AVX512F-ONLY-NEXT: orq %rcx, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $41, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $43, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $44, %r12 +; AVX512F-ONLY-NEXT: orq %rcx, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $45, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $46, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $48, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $49, %r13 +; AVX512F-ONLY-NEXT: orq %rcx, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r11, %rcx +; AVX512F-ONLY-NEXT: shlq $51, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r11, %rcx +; AVX512F-ONLY-NEXT: shlq $53, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $54, %r11 +; AVX512F-ONLY-NEXT: orq %rcx, %r11 +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %r11, %rax +; AVX512F-ONLY-NEXT: movq %r9, %rcx +; AVX512F-ONLY-NEXT: shlq $56, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r9, %rcx +; AVX512F-ONLY-NEXT: shlq $58, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $59, %r9 +; AVX512F-ONLY-NEXT: orq %rcx, %r9 +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $60, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rcx +; AVX512F-ONLY-NEXT: shlq $61, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $62, %r14 +; AVX512F-ONLY-NEXT: orq %rcx, %r14 +; AVX512F-ONLY-NEXT: shlq $63, %rdx +; AVX512F-ONLY-NEXT: orq %r14, %rdx +; AVX512F-ONLY-NEXT: orq %r10, %rdx +; AVX512F-ONLY-NEXT: movq %rdx, (%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r13 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor5_vf16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %edi +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r8d +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r9d +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r10d +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r11d +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r15d +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: movl %edx, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: leal (%r14,%r11,2), %ebx +; AVX512DQ-NEXT: leal (%rbx,%r11,4), %ebx +; AVX512DQ-NEXT: leal (%rbx,%r11,8), %ebx +; AVX512DQ-NEXT: movl %r11d, %r12d +; AVX512DQ-NEXT: shll $4, %r12d +; AVX512DQ-NEXT: orl %ebx, %r12d +; AVX512DQ-NEXT: shll $5, %r11d +; AVX512DQ-NEXT: orl %r12d, %r11d +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movl %ebp, %ebx +; AVX512DQ-NEXT: shll $6, %ebx +; AVX512DQ-NEXT: movl %ebp, %r12d +; AVX512DQ-NEXT: shll $7, %r12d +; AVX512DQ-NEXT: orl %ebx, %r12d +; AVX512DQ-NEXT: movl %ebp, %ebx +; AVX512DQ-NEXT: shll $8, %ebx +; AVX512DQ-NEXT: orl %r12d, %ebx +; AVX512DQ-NEXT: movl %ebp, %r12d +; AVX512DQ-NEXT: shll $9, %r12d +; AVX512DQ-NEXT: orl %ebx, %r12d +; AVX512DQ-NEXT: shll $10, %ebp +; AVX512DQ-NEXT: orl %r12d, %ebp +; AVX512DQ-NEXT: movl %eax, %ebx +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movl %ebx, %r12d +; AVX512DQ-NEXT: shll $11, %r12d +; AVX512DQ-NEXT: orl %ebp, %r12d +; AVX512DQ-NEXT: movl %ebx, %ebp +; AVX512DQ-NEXT: shll $12, %ebp +; AVX512DQ-NEXT: orl %r12d, %ebp +; AVX512DQ-NEXT: movl %ebx, %r13d +; AVX512DQ-NEXT: shll $13, %r13d +; AVX512DQ-NEXT: orl %ebp, %r13d +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k1 +; AVX512DQ-NEXT: shll $14, %ebx +; AVX512DQ-NEXT: orl %r13d, %ebx +; AVX512DQ-NEXT: kmovw %k1, %r13d +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512DQ-NEXT: shll $15, %eax +; AVX512DQ-NEXT: orl %ebx, %eax +; AVX512DQ-NEXT: orl %r11d, %eax +; AVX512DQ-NEXT: movw %ax, 8(%rsi) +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: movzbl %al, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %r11 +; AVX512DQ-NEXT: leaq (%r11,%rax,4), %r11 +; AVX512DQ-NEXT: leaq (%r11,%rax,8), %r11 +; AVX512DQ-NEXT: shlq $4, %rax +; AVX512DQ-NEXT: orq %r11, %rax +; AVX512DQ-NEXT: movzbl %r10b, %ebp +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movq %rbp, %r10 +; AVX512DQ-NEXT: shlq $5, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movq %rbp, %r11 +; AVX512DQ-NEXT: shlq $7, %r11 +; AVX512DQ-NEXT: orq %rax, %r11 +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r11, %rax +; AVX512DQ-NEXT: kmovw %k1, %r11d +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512DQ-NEXT: shlq $9, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: movzbl %r9b, %ebx +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %rbp, %rax +; AVX512DQ-NEXT: movq %rbx, %r9 +; AVX512DQ-NEXT: shlq $11, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %rbx, %rbp +; AVX512DQ-NEXT: shlq $13, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k0, %r9d +; AVX512DQ-NEXT: movzbl %dl, %edx +; AVX512DQ-NEXT: shlq $14, %rbx +; AVX512DQ-NEXT: orq %rbp, %rbx +; AVX512DQ-NEXT: movzbl %r8b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $15, %r8 +; AVX512DQ-NEXT: orq %rbx, %r8 +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $16, %rbx +; AVX512DQ-NEXT: orq %r8, %rbx +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $17, %r8 +; AVX512DQ-NEXT: orq %rbx, %r8 +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $18, %rbx +; AVX512DQ-NEXT: orq %r8, %rbx +; AVX512DQ-NEXT: shlq $19, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movzbl %dil, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $20, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $21, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $22, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: shlq $24, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $25, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $26, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $27, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $28, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: shlq $29, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movzbl %cl, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $30, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: shlq $31, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $33, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: shlq $34, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $36, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $37, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $38, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $39, %r15 +; AVX512DQ-NEXT: orq %rcx, %r15 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $41, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $43, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $44, %r12 +; AVX512DQ-NEXT: orq %rcx, %r12 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $45, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $46, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $48, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $49, %r13 +; AVX512DQ-NEXT: orq %rcx, %r13 +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r11, %rcx +; AVX512DQ-NEXT: shlq $51, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r11, %rcx +; AVX512DQ-NEXT: shlq $53, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $54, %r11 +; AVX512DQ-NEXT: orq %rcx, %r11 +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %r11, %rax +; AVX512DQ-NEXT: movq %r9, %rcx +; AVX512DQ-NEXT: shlq $56, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r9, %rcx +; AVX512DQ-NEXT: shlq $58, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $59, %r9 +; AVX512DQ-NEXT: orq %rcx, %r9 +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $60, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %r14, %rcx +; AVX512DQ-NEXT: shlq $61, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $62, %r14 +; AVX512DQ-NEXT: orq %rcx, %r14 +; AVX512DQ-NEXT: shlq $63, %rdx +; AVX512DQ-NEXT: orq %r14, %rdx +; AVX512DQ-NEXT: orq %r10, %rdx +; AVX512DQ-NEXT: movq %rdx, (%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor5_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrw $6, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrw $5, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrw $4, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrw $3, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrw $2, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrw $1, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrw $13, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrw $14, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrw $15, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrw $7, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movl %edx, %r14d +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: leal (%r14,%r11,2), %ebx +; AVX512BW-NEXT: leal (%rbx,%r11,4), %ebx +; AVX512BW-NEXT: leal (%rbx,%r11,8), %ebx +; AVX512BW-NEXT: movl %r11d, %r12d +; AVX512BW-NEXT: shll $4, %r12d +; AVX512BW-NEXT: orl %ebx, %r12d +; AVX512BW-NEXT: shll $5, %r11d +; AVX512BW-NEXT: orl %r12d, %r11d +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movl %ebp, %ebx +; AVX512BW-NEXT: shll $6, %ebx +; AVX512BW-NEXT: movl %ebp, %r12d +; AVX512BW-NEXT: shll $7, %r12d +; AVX512BW-NEXT: orl %ebx, %r12d +; AVX512BW-NEXT: movl %ebp, %ebx +; AVX512BW-NEXT: shll $8, %ebx +; AVX512BW-NEXT: orl %r12d, %ebx +; AVX512BW-NEXT: movl %ebp, %r12d +; AVX512BW-NEXT: shll $9, %r12d +; AVX512BW-NEXT: orl %ebx, %r12d +; AVX512BW-NEXT: shll $10, %ebp +; AVX512BW-NEXT: orl %r12d, %ebp +; AVX512BW-NEXT: movl %eax, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movl %ebx, %r12d +; AVX512BW-NEXT: shll $11, %r12d +; AVX512BW-NEXT: orl %ebp, %r12d +; AVX512BW-NEXT: movl %ebx, %ebp +; AVX512BW-NEXT: shll $12, %ebp +; AVX512BW-NEXT: orl %r12d, %ebp +; AVX512BW-NEXT: movl %ebx, %r13d +; AVX512BW-NEXT: shll $13, %r13d +; AVX512BW-NEXT: orl %ebp, %r13d +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrw $9, %k0, %k1 +; AVX512BW-NEXT: shll $14, %ebx +; AVX512BW-NEXT: orl %r13d, %ebx +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrw $10, %k0, %k1 +; AVX512BW-NEXT: shll $15, %eax +; AVX512BW-NEXT: orl %ebx, %eax +; AVX512BW-NEXT: orl %r11d, %eax +; AVX512BW-NEXT: movw %ax, 8(%rsi) +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: movzbl %al, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r11 +; AVX512BW-NEXT: leaq (%r11,%rax,4), %r11 +; AVX512BW-NEXT: leaq (%r11,%rax,8), %r11 +; AVX512BW-NEXT: shlq $4, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movzbl %r10b, %ebp +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %r10 +; AVX512BW-NEXT: shlq $5, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %rbp, %r11 +; AVX512BW-NEXT: shlq $7, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 +; AVX512BW-NEXT: shlq $9, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: movzbl %r9b, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %rbx, %r9 +; AVX512BW-NEXT: shlq $11, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %rbx, %rbp +; AVX512BW-NEXT: shlq $13, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k0, %r9d +; AVX512BW-NEXT: movzbl %dl, %edx +; AVX512BW-NEXT: shlq $14, %rbx +; AVX512BW-NEXT: orq %rbp, %rbx +; AVX512BW-NEXT: movzbl %r8b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $15, %r8 +; AVX512BW-NEXT: orq %rbx, %r8 +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: shlq $16, %rbx +; AVX512BW-NEXT: orq %r8, %rbx +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $17, %r8 +; AVX512BW-NEXT: orq %rbx, %r8 +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: shlq $18, %rbx +; AVX512BW-NEXT: orq %r8, %rbx +; AVX512BW-NEXT: shlq $19, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movzbl %dil, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $20, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $21, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $22, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $23, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: shlq $24, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $25, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $26, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $27, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $28, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: shlq $29, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movzbl %cl, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $30, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: shlq $31, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $33, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: shlq $34, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $36, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $37, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $38, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $39, %r15 +; AVX512BW-NEXT: orq %rcx, %r15 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $41, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $43, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $44, %r12 +; AVX512BW-NEXT: orq %rcx, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $45, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $46, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $47, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $48, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $49, %r13 +; AVX512BW-NEXT: orq %rcx, %r13 +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r11, %rcx +; AVX512BW-NEXT: shlq $51, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r11, %rcx +; AVX512BW-NEXT: shlq $53, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $54, %r11 +; AVX512BW-NEXT: orq %rcx, %r11 +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $55, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r9, %rcx +; AVX512BW-NEXT: shlq $56, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $57, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r9, %rcx +; AVX512BW-NEXT: shlq $58, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $59, %r9 +; AVX512BW-NEXT: orq %rcx, %r9 +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r14, %rcx +; AVX512BW-NEXT: shlq $61, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $62, %r14 +; AVX512BW-NEXT: orq %rcx, %r14 +; AVX512BW-NEXT: shlq $63, %rdx +; AVX512BW-NEXT: orq %r14, %rdx +; AVX512BW-NEXT: orq %r10, %rdx +; AVX512BW-NEXT: movq %rdx, (%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <16 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <80 x i32> + store <80 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor5_vf32(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor5_vf32: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r13 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k0 +; AVX512F-ONLY-NEXT: kshiftrw $9, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %edx +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %edi +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $15, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $10, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k0, %k2 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movl %ecx, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: leal (%r9,%r9,2), %r13d +; AVX512F-ONLY-NEXT: leal (%r13,%r14,4), %r13d +; AVX512F-ONLY-NEXT: leal (%r13,%r14,8), %ebp +; AVX512F-ONLY-NEXT: movl %r14d, %r13d +; AVX512F-ONLY-NEXT: shll $4, %r13d +; AVX512F-ONLY-NEXT: orl %ebp, %r13d +; AVX512F-ONLY-NEXT: movl %r14d, %ebp +; AVX512F-ONLY-NEXT: shll $5, %ebp +; AVX512F-ONLY-NEXT: orl %r13d, %ebp +; AVX512F-ONLY-NEXT: shll $6, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movl %r15d, %r13d +; AVX512F-ONLY-NEXT: shll $7, %r13d +; AVX512F-ONLY-NEXT: orl %r14d, %r13d +; AVX512F-ONLY-NEXT: movl %r15d, %r14d +; AVX512F-ONLY-NEXT: shll $8, %r14d +; AVX512F-ONLY-NEXT: orl %r13d, %r14d +; AVX512F-ONLY-NEXT: movl %r15d, %r13d +; AVX512F-ONLY-NEXT: shll $9, %r13d +; AVX512F-ONLY-NEXT: orl %r14d, %r13d +; AVX512F-ONLY-NEXT: movl %r15d, %eax +; AVX512F-ONLY-NEXT: shll $10, %eax +; AVX512F-ONLY-NEXT: orl %r13d, %eax +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k0, %k2 +; AVX512F-ONLY-NEXT: shll $11, %r15d +; AVX512F-ONLY-NEXT: orl %eax, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movl %r12d, %eax +; AVX512F-ONLY-NEXT: shll $12, %eax +; AVX512F-ONLY-NEXT: orl %r15d, %eax +; AVX512F-ONLY-NEXT: movl %r12d, %r15d +; AVX512F-ONLY-NEXT: shll $13, %r15d +; AVX512F-ONLY-NEXT: orl %eax, %r15d +; AVX512F-ONLY-NEXT: movl %r12d, %eax +; AVX512F-ONLY-NEXT: shll $14, %eax +; AVX512F-ONLY-NEXT: orl %r15d, %eax +; AVX512F-ONLY-NEXT: movl %r12d, %r13d +; AVX512F-ONLY-NEXT: shll $15, %r13d +; AVX512F-ONLY-NEXT: orl %eax, %r13d +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $15, %k0, %k2 +; AVX512F-ONLY-NEXT: shll $16, %r12d +; AVX512F-ONLY-NEXT: orl %r13d, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movl %r14d, %eax +; AVX512F-ONLY-NEXT: shll $17, %eax +; AVX512F-ONLY-NEXT: orl %r12d, %eax +; AVX512F-ONLY-NEXT: movl %r14d, %r12d +; AVX512F-ONLY-NEXT: shll $18, %r12d +; AVX512F-ONLY-NEXT: orl %eax, %r12d +; AVX512F-ONLY-NEXT: movl %r14d, %eax +; AVX512F-ONLY-NEXT: shll $19, %eax +; AVX512F-ONLY-NEXT: orl %r12d, %eax +; AVX512F-ONLY-NEXT: movl %r14d, %r12d +; AVX512F-ONLY-NEXT: shll $20, %r12d +; AVX512F-ONLY-NEXT: orl %eax, %r12d +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $6, %k1, %k2 +; AVX512F-ONLY-NEXT: shll $21, %r14d +; AVX512F-ONLY-NEXT: orl %r12d, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movl %r15d, %r12d +; AVX512F-ONLY-NEXT: shll $22, %r12d +; AVX512F-ONLY-NEXT: orl %r14d, %r12d +; AVX512F-ONLY-NEXT: movl %r15d, %r14d +; AVX512F-ONLY-NEXT: shll $23, %r14d +; AVX512F-ONLY-NEXT: orl %r12d, %r14d +; AVX512F-ONLY-NEXT: movl %r15d, %r12d +; AVX512F-ONLY-NEXT: shll $24, %r12d +; AVX512F-ONLY-NEXT: orl %r14d, %r12d +; AVX512F-ONLY-NEXT: movl %r15d, %r14d +; AVX512F-ONLY-NEXT: shll $25, %r14d +; AVX512F-ONLY-NEXT: orl %r12d, %r14d +; AVX512F-ONLY-NEXT: shll $26, %r15d +; AVX512F-ONLY-NEXT: orl %r14d, %r15d +; AVX512F-ONLY-NEXT: movl %eax, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movl %r14d, %r12d +; AVX512F-ONLY-NEXT: shll $27, %r12d +; AVX512F-ONLY-NEXT: orl %r15d, %r12d +; AVX512F-ONLY-NEXT: movl %r14d, %r15d +; AVX512F-ONLY-NEXT: shll $28, %r15d +; AVX512F-ONLY-NEXT: orl %r12d, %r15d +; AVX512F-ONLY-NEXT: movl %r14d, %r12d +; AVX512F-ONLY-NEXT: shll $29, %r12d +; AVX512F-ONLY-NEXT: orl %r15d, %r12d +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k2 +; AVX512F-ONLY-NEXT: shll $30, %r14d +; AVX512F-ONLY-NEXT: orl %r12d, %r14d +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k1, %k2 +; AVX512F-ONLY-NEXT: shll $31, %eax +; AVX512F-ONLY-NEXT: orl %r14d, %eax +; AVX512F-ONLY-NEXT: orl %ebp, %eax +; AVX512F-ONLY-NEXT: movl %eax, 16(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: movzbl %al, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %r14 +; AVX512F-ONLY-NEXT: leaq (%r14,%rax,4), %r14 +; AVX512F-ONLY-NEXT: leaq (%r14,%rax,8), %r14 +; AVX512F-ONLY-NEXT: shlq $4, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movzbl %r12b, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r14 +; AVX512F-ONLY-NEXT: shlq $5, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $3, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $9, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $11, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $13, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $4, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $14, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $15, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $16, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $17, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $18, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $5, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $19, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $20, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $21, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $22, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $23, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $12, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $24, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $25, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $26, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $27, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $28, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $29, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %r15b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $30, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $7, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $32, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $34, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $36, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $37, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $38, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $39, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $43, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $44, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $45, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $46, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $48, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-ONLY-NEXT: shlq $49, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $53, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k1, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $54, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $56, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $58, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $59, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r15d +; AVX512F-ONLY-NEXT: movl %ebp, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $60, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $61, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $62, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $63, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k1 +; AVX512F-ONLY-NEXT: orq %r14, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %r14d +; AVX512F-ONLY-NEXT: movq %r13, (%rsi) +; AVX512F-ONLY-NEXT: movzbl %bl, %ebx +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: leaq (%r15,%rbx,2), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%rbx,4), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%rbx,8), %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r15 +; AVX512F-ONLY-NEXT: shlq $4, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $5, %rbx +; AVX512F-ONLY-NEXT: orq %r15, %rbx +; AVX512F-ONLY-NEXT: movzbl %r11b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r11 +; AVX512F-ONLY-NEXT: shlq $6, %r11 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $7, %r15 +; AVX512F-ONLY-NEXT: orq %r11, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r11 +; AVX512F-ONLY-NEXT: shlq $8, %r11 +; AVX512F-ONLY-NEXT: orq %r15, %r11 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $9, %r13 +; AVX512F-ONLY-NEXT: orq %r11, %r13 +; AVX512F-ONLY-NEXT: kmovw %k0, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k0, %k0 +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movzbl %r10b, %r11d +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: movq %r11, %r10 +; AVX512F-ONLY-NEXT: shlq $11, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r11, %r10 +; AVX512F-ONLY-NEXT: shlq $13, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: kmovw %k0, %r10d +; AVX512F-ONLY-NEXT: shlq $15, %r11 +; AVX512F-ONLY-NEXT: orq %rax, %r11 +; AVX512F-ONLY-NEXT: movzbl %r15b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $16, %r15 +; AVX512F-ONLY-NEXT: orq %r11, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r11 +; AVX512F-ONLY-NEXT: shlq $17, %r11 +; AVX512F-ONLY-NEXT: orq %r15, %r11 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $18, %r15 +; AVX512F-ONLY-NEXT: orq %r11, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r11 +; AVX512F-ONLY-NEXT: shlq $19, %r11 +; AVX512F-ONLY-NEXT: orq %r15, %r11 +; AVX512F-ONLY-NEXT: shlq $20, %rax +; AVX512F-ONLY-NEXT: orq %r11, %rax +; AVX512F-ONLY-NEXT: movzbl %r8b, %r8d +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %r11 +; AVX512F-ONLY-NEXT: shlq $21, %r11 +; AVX512F-ONLY-NEXT: orq %rax, %r11 +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $22, %rax +; AVX512F-ONLY-NEXT: orq %r11, %rax +; AVX512F-ONLY-NEXT: movq %r8, %r11 +; AVX512F-ONLY-NEXT: shlq $23, %r11 +; AVX512F-ONLY-NEXT: orq %rax, %r11 +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $24, %rax +; AVX512F-ONLY-NEXT: orq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $25, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movzbl %dil, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $26, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $27, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $28, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $29, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $30, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movzbl %dl, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $31, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $33, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %rdx +; AVX512F-ONLY-NEXT: shlq $37, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %rdx +; AVX512F-ONLY-NEXT: shlq $39, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $40, %rbp +; AVX512F-ONLY-NEXT: orq %rdx, %rbp +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $41, %rax +; AVX512F-ONLY-NEXT: orq %rbp, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rdx +; AVX512F-ONLY-NEXT: shlq $42, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $43, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rdx +; AVX512F-ONLY-NEXT: shlq $44, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $45, %r12 +; AVX512F-ONLY-NEXT: orq %rdx, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rdx +; AVX512F-ONLY-NEXT: shlq $47, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rdx +; AVX512F-ONLY-NEXT: shlq $49, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $50, %r14 +; AVX512F-ONLY-NEXT: orq %rdx, %r14 +; AVX512F-ONLY-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $51, %rdx +; AVX512F-ONLY-NEXT: orq %r14, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $52, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $53, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $54, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rdx +; AVX512F-ONLY-NEXT: shlq $56, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %r10, %rdx +; AVX512F-ONLY-NEXT: shlq $58, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $59, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $60, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $61, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $62, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movzbl %cl, %eax +; AVX512F-ONLY-NEXT: shlq $63, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %rax, 8(%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r13 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor5_vf32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k1 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %ecx +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %edx +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %edi +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r8d +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r10d +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r11d +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %ebx +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movl %ecx, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: leal (%r9,%r9,2), %r13d +; AVX512DQ-NEXT: leal (%r13,%r14,4), %r13d +; AVX512DQ-NEXT: leal (%r13,%r14,8), %ebp +; AVX512DQ-NEXT: movl %r14d, %r13d +; AVX512DQ-NEXT: shll $4, %r13d +; AVX512DQ-NEXT: orl %ebp, %r13d +; AVX512DQ-NEXT: movl %r14d, %ebp +; AVX512DQ-NEXT: shll $5, %ebp +; AVX512DQ-NEXT: orl %r13d, %ebp +; AVX512DQ-NEXT: shll $6, %r14d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movl %r15d, %r13d +; AVX512DQ-NEXT: shll $7, %r13d +; AVX512DQ-NEXT: orl %r14d, %r13d +; AVX512DQ-NEXT: movl %r15d, %r14d +; AVX512DQ-NEXT: shll $8, %r14d +; AVX512DQ-NEXT: orl %r13d, %r14d +; AVX512DQ-NEXT: movl %r15d, %r13d +; AVX512DQ-NEXT: shll $9, %r13d +; AVX512DQ-NEXT: orl %r14d, %r13d +; AVX512DQ-NEXT: movl %r15d, %eax +; AVX512DQ-NEXT: shll $10, %eax +; AVX512DQ-NEXT: orl %r13d, %eax +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQ-NEXT: shll $11, %r15d +; AVX512DQ-NEXT: orl %eax, %r15d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movl %r12d, %eax +; AVX512DQ-NEXT: shll $12, %eax +; AVX512DQ-NEXT: orl %r15d, %eax +; AVX512DQ-NEXT: movl %r12d, %r15d +; AVX512DQ-NEXT: shll $13, %r15d +; AVX512DQ-NEXT: orl %eax, %r15d +; AVX512DQ-NEXT: movl %r12d, %eax +; AVX512DQ-NEXT: shll $14, %eax +; AVX512DQ-NEXT: orl %r15d, %eax +; AVX512DQ-NEXT: movl %r12d, %r13d +; AVX512DQ-NEXT: shll $15, %r13d +; AVX512DQ-NEXT: orl %eax, %r13d +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k2 +; AVX512DQ-NEXT: shll $16, %r12d +; AVX512DQ-NEXT: orl %r13d, %r12d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movl %r14d, %eax +; AVX512DQ-NEXT: shll $17, %eax +; AVX512DQ-NEXT: orl %r12d, %eax +; AVX512DQ-NEXT: movl %r14d, %r12d +; AVX512DQ-NEXT: shll $18, %r12d +; AVX512DQ-NEXT: orl %eax, %r12d +; AVX512DQ-NEXT: movl %r14d, %eax +; AVX512DQ-NEXT: shll $19, %eax +; AVX512DQ-NEXT: orl %r12d, %eax +; AVX512DQ-NEXT: movl %r14d, %r12d +; AVX512DQ-NEXT: shll $20, %r12d +; AVX512DQ-NEXT: orl %eax, %r12d +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k2 +; AVX512DQ-NEXT: shll $21, %r14d +; AVX512DQ-NEXT: orl %r12d, %r14d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movl %r15d, %r12d +; AVX512DQ-NEXT: shll $22, %r12d +; AVX512DQ-NEXT: orl %r14d, %r12d +; AVX512DQ-NEXT: movl %r15d, %r14d +; AVX512DQ-NEXT: shll $23, %r14d +; AVX512DQ-NEXT: orl %r12d, %r14d +; AVX512DQ-NEXT: movl %r15d, %r12d +; AVX512DQ-NEXT: shll $24, %r12d +; AVX512DQ-NEXT: orl %r14d, %r12d +; AVX512DQ-NEXT: movl %r15d, %r14d +; AVX512DQ-NEXT: shll $25, %r14d +; AVX512DQ-NEXT: orl %r12d, %r14d +; AVX512DQ-NEXT: shll $26, %r15d +; AVX512DQ-NEXT: orl %r14d, %r15d +; AVX512DQ-NEXT: movl %eax, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movl %r14d, %r12d +; AVX512DQ-NEXT: shll $27, %r12d +; AVX512DQ-NEXT: orl %r15d, %r12d +; AVX512DQ-NEXT: movl %r14d, %r15d +; AVX512DQ-NEXT: shll $28, %r15d +; AVX512DQ-NEXT: orl %r12d, %r15d +; AVX512DQ-NEXT: movl %r14d, %r12d +; AVX512DQ-NEXT: shll $29, %r12d +; AVX512DQ-NEXT: orl %r15d, %r12d +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k2 +; AVX512DQ-NEXT: shll $30, %r14d +; AVX512DQ-NEXT: orl %r12d, %r14d +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k2 +; AVX512DQ-NEXT: shll $31, %eax +; AVX512DQ-NEXT: orl %r14d, %eax +; AVX512DQ-NEXT: orl %ebp, %eax +; AVX512DQ-NEXT: movl %eax, 16(%rsi) +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: movzbl %al, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %r14 +; AVX512DQ-NEXT: leaq (%r14,%rax,4), %r14 +; AVX512DQ-NEXT: leaq (%r14,%rax,8), %r14 +; AVX512DQ-NEXT: shlq $4, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movzbl %r12b, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r14 +; AVX512DQ-NEXT: shlq $5, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $7, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k2 +; AVX512DQ-NEXT: shlq $9, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $11, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $13, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k2 +; AVX512DQ-NEXT: shlq $14, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $15, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $16, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $17, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $18, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k2 +; AVX512DQ-NEXT: shlq $19, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $20, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $21, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $22, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $23, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k2 +; AVX512DQ-NEXT: shlq $24, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $25, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $26, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $27, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $28, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $29, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %r15b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $30, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k2 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $32, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 +; AVX512DQ-NEXT: shlq $34, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $36, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $37, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $38, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQ-NEXT: shlq $39, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $43, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k2 +; AVX512DQ-NEXT: shlq $44, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $45, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $46, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $48, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 +; AVX512DQ-NEXT: shlq $49, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $53, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k1, %r13d +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512DQ-NEXT: shlq $54, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $56, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $58, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: shlq $59, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r15d +; AVX512DQ-NEXT: movl %ebp, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $60, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $61, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k1 +; AVX512DQ-NEXT: shlq $62, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $63, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512DQ-NEXT: orq %r14, %r13 +; AVX512DQ-NEXT: kmovw %k1, %r14d +; AVX512DQ-NEXT: movq %r13, (%rsi) +; AVX512DQ-NEXT: movzbl %bl, %ebx +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: leaq (%r15,%rbx,2), %rax +; AVX512DQ-NEXT: leaq (%rax,%rbx,4), %rax +; AVX512DQ-NEXT: leaq (%rax,%rbx,8), %rax +; AVX512DQ-NEXT: movq %rbx, %r15 +; AVX512DQ-NEXT: shlq $4, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: shlq $5, %rbx +; AVX512DQ-NEXT: orq %r15, %rbx +; AVX512DQ-NEXT: movzbl %r11b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r11 +; AVX512DQ-NEXT: shlq $6, %r11 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $7, %r15 +; AVX512DQ-NEXT: orq %r11, %r15 +; AVX512DQ-NEXT: movq %rax, %r11 +; AVX512DQ-NEXT: shlq $8, %r11 +; AVX512DQ-NEXT: orq %r15, %r11 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $9, %r13 +; AVX512DQ-NEXT: orq %r11, %r13 +; AVX512DQ-NEXT: kmovw %k0, %r15d +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movzbl %r10b, %r11d +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: movq %r11, %r10 +; AVX512DQ-NEXT: shlq $11, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r11, %r10 +; AVX512DQ-NEXT: shlq $13, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: kmovw %k0, %r10d +; AVX512DQ-NEXT: shlq $15, %r11 +; AVX512DQ-NEXT: orq %rax, %r11 +; AVX512DQ-NEXT: movzbl %r15b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $16, %r15 +; AVX512DQ-NEXT: orq %r11, %r15 +; AVX512DQ-NEXT: movq %rax, %r11 +; AVX512DQ-NEXT: shlq $17, %r11 +; AVX512DQ-NEXT: orq %r15, %r11 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $18, %r15 +; AVX512DQ-NEXT: orq %r11, %r15 +; AVX512DQ-NEXT: movq %rax, %r11 +; AVX512DQ-NEXT: shlq $19, %r11 +; AVX512DQ-NEXT: orq %r15, %r11 +; AVX512DQ-NEXT: shlq $20, %rax +; AVX512DQ-NEXT: orq %r11, %rax +; AVX512DQ-NEXT: movzbl %r8b, %r8d +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %r11 +; AVX512DQ-NEXT: shlq $21, %r11 +; AVX512DQ-NEXT: orq %rax, %r11 +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $22, %rax +; AVX512DQ-NEXT: orq %r11, %rax +; AVX512DQ-NEXT: movq %r8, %r11 +; AVX512DQ-NEXT: shlq $23, %r11 +; AVX512DQ-NEXT: orq %rax, %r11 +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $24, %rax +; AVX512DQ-NEXT: orq %r11, %rax +; AVX512DQ-NEXT: shlq $25, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movzbl %dil, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $26, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $27, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $28, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $29, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: shlq $30, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movzbl %dl, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $31, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: movq %rdx, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $33, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: movq %rdx, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: shlq $35, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %rbp, %rdx +; AVX512DQ-NEXT: shlq $37, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %rbp, %rdx +; AVX512DQ-NEXT: shlq $39, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: shlq $40, %rbp +; AVX512DQ-NEXT: orq %rdx, %rbp +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $41, %rax +; AVX512DQ-NEXT: orq %rbp, %rax +; AVX512DQ-NEXT: movq %r12, %rdx +; AVX512DQ-NEXT: shlq $42, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $43, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %r12, %rdx +; AVX512DQ-NEXT: shlq $44, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: shlq $45, %r12 +; AVX512DQ-NEXT: orq %rdx, %r12 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %rdx +; AVX512DQ-NEXT: shlq $47, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %r14, %rdx +; AVX512DQ-NEXT: shlq $49, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: shlq $50, %r14 +; AVX512DQ-NEXT: orq %rdx, %r14 +; AVX512DQ-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $51, %rdx +; AVX512DQ-NEXT: orq %r14, %rdx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $52, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $53, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $54, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rdx +; AVX512DQ-NEXT: shlq $56, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %r10, %rdx +; AVX512DQ-NEXT: shlq $58, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $59, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: shlq $60, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $61, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: shlq $62, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movzbl %cl, %eax +; AVX512DQ-NEXT: shlq $63, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %rax, 8(%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor5_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovd (%rdi), %k0 +; AVX512BW-NEXT: kshiftrd $25, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrd $19, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $18, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrd $17, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrd $15, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrd $14, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrd $13, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $29, %k0, %k1 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movl %ecx, %r9d +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: leal (%r9,%r9,2), %r13d +; AVX512BW-NEXT: leal (%r13,%r14,4), %r13d +; AVX512BW-NEXT: leal (%r13,%r14,8), %ebp +; AVX512BW-NEXT: movl %r14d, %r13d +; AVX512BW-NEXT: shll $4, %r13d +; AVX512BW-NEXT: orl %ebp, %r13d +; AVX512BW-NEXT: movl %r14d, %ebp +; AVX512BW-NEXT: shll $5, %ebp +; AVX512BW-NEXT: orl %r13d, %ebp +; AVX512BW-NEXT: shll $6, %r14d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movl %r15d, %r13d +; AVX512BW-NEXT: shll $7, %r13d +; AVX512BW-NEXT: orl %r14d, %r13d +; AVX512BW-NEXT: movl %r15d, %r14d +; AVX512BW-NEXT: shll $8, %r14d +; AVX512BW-NEXT: orl %r13d, %r14d +; AVX512BW-NEXT: movl %r15d, %r13d +; AVX512BW-NEXT: shll $9, %r13d +; AVX512BW-NEXT: orl %r14d, %r13d +; AVX512BW-NEXT: movl %r15d, %eax +; AVX512BW-NEXT: shll $10, %eax +; AVX512BW-NEXT: orl %r13d, %eax +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $30, %k0, %k1 +; AVX512BW-NEXT: shll $11, %r15d +; AVX512BW-NEXT: orl %eax, %r15d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movl %r12d, %eax +; AVX512BW-NEXT: shll $12, %eax +; AVX512BW-NEXT: orl %r15d, %eax +; AVX512BW-NEXT: movl %r12d, %r15d +; AVX512BW-NEXT: shll $13, %r15d +; AVX512BW-NEXT: orl %eax, %r15d +; AVX512BW-NEXT: movl %r12d, %eax +; AVX512BW-NEXT: shll $14, %eax +; AVX512BW-NEXT: orl %r15d, %eax +; AVX512BW-NEXT: movl %r12d, %r13d +; AVX512BW-NEXT: shll $15, %r13d +; AVX512BW-NEXT: orl %eax, %r13d +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $31, %k0, %k1 +; AVX512BW-NEXT: shll $16, %r12d +; AVX512BW-NEXT: orl %r13d, %r12d +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movl %r14d, %eax +; AVX512BW-NEXT: shll $17, %eax +; AVX512BW-NEXT: orl %r12d, %eax +; AVX512BW-NEXT: movl %r14d, %r12d +; AVX512BW-NEXT: shll $18, %r12d +; AVX512BW-NEXT: orl %eax, %r12d +; AVX512BW-NEXT: movl %r14d, %eax +; AVX512BW-NEXT: shll $19, %eax +; AVX512BW-NEXT: orl %r12d, %eax +; AVX512BW-NEXT: movl %r14d, %r12d +; AVX512BW-NEXT: shll $20, %r12d +; AVX512BW-NEXT: orl %eax, %r12d +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrd $6, %k0, %k1 +; AVX512BW-NEXT: shll $21, %r14d +; AVX512BW-NEXT: orl %r12d, %r14d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movl %r15d, %r12d +; AVX512BW-NEXT: shll $22, %r12d +; AVX512BW-NEXT: orl %r14d, %r12d +; AVX512BW-NEXT: movl %r15d, %r14d +; AVX512BW-NEXT: shll $23, %r14d +; AVX512BW-NEXT: orl %r12d, %r14d +; AVX512BW-NEXT: movl %r15d, %r12d +; AVX512BW-NEXT: shll $24, %r12d +; AVX512BW-NEXT: orl %r14d, %r12d +; AVX512BW-NEXT: movl %r15d, %r14d +; AVX512BW-NEXT: shll $25, %r14d +; AVX512BW-NEXT: orl %r12d, %r14d +; AVX512BW-NEXT: shll $26, %r15d +; AVX512BW-NEXT: orl %r14d, %r15d +; AVX512BW-NEXT: movl %eax, %r14d +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movl %r14d, %r12d +; AVX512BW-NEXT: shll $27, %r12d +; AVX512BW-NEXT: orl %r15d, %r12d +; AVX512BW-NEXT: movl %r14d, %r15d +; AVX512BW-NEXT: shll $28, %r15d +; AVX512BW-NEXT: orl %r12d, %r15d +; AVX512BW-NEXT: movl %r14d, %r12d +; AVX512BW-NEXT: shll $29, %r12d +; AVX512BW-NEXT: orl %r15d, %r12d +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 +; AVX512BW-NEXT: shll $30, %r14d +; AVX512BW-NEXT: orl %r12d, %r14d +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $2, %k0, %k1 +; AVX512BW-NEXT: shll $31, %eax +; AVX512BW-NEXT: orl %r14d, %eax +; AVX512BW-NEXT: orl %ebp, %eax +; AVX512BW-NEXT: movl %eax, 16(%rsi) +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: movzbl %al, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r14 +; AVX512BW-NEXT: leaq (%r14,%rax,4), %r14 +; AVX512BW-NEXT: leaq (%r14,%rax,8), %r14 +; AVX512BW-NEXT: shlq $4, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movzbl %r12b, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %r14 +; AVX512BW-NEXT: shlq $5, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrd $3, %k0, %k1 +; AVX512BW-NEXT: shlq $9, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $13, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrd $4, %k0, %k1 +; AVX512BW-NEXT: shlq $14, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $15, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $16, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $17, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $18, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrd $5, %k0, %k1 +; AVX512BW-NEXT: shlq $19, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $21, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $23, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrd $12, %k0, %k1 +; AVX512BW-NEXT: shlq $24, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $25, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $26, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $27, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $28, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: shlq $29, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movzbl %r15b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $30, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrd $7, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $32, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $8, %k0, %k1 +; AVX512BW-NEXT: shlq $34, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $36, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $37, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $38, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $9, %k0, %k1 +; AVX512BW-NEXT: shlq $39, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $43, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $10, %k0, %k1 +; AVX512BW-NEXT: shlq $44, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $45, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $46, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $47, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $48, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 +; AVX512BW-NEXT: shlq $49, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $51, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $53, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrd $20, %k0, %k1 +; AVX512BW-NEXT: shlq $54, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $55, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: shlq $56, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $57, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: shlq $58, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: shlq $59, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movl %ebp, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $60, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $61, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 +; AVX512BW-NEXT: movzbl %bpl, %ebp +; AVX512BW-NEXT: shlq $62, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movq %rbp, %r12 +; AVX512BW-NEXT: shlq $63, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $22, %k0, %k1 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $23, %k0, %k1 +; AVX512BW-NEXT: movq %r12, (%rsi) +; AVX512BW-NEXT: movzbl %bl, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: leaq (%rbp,%rbx,2), %rax +; AVX512BW-NEXT: leaq (%rax,%rbx,4), %rax +; AVX512BW-NEXT: leaq (%rax,%rbx,8), %rax +; AVX512BW-NEXT: movq %rbx, %r12 +; AVX512BW-NEXT: shlq $4, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: shlq $5, %rbx +; AVX512BW-NEXT: orq %r12, %rbx +; AVX512BW-NEXT: movzbl %r11b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: shlq $6, %r11 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %r11, %r12 +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: shlq $8, %r11 +; AVX512BW-NEXT: orq %r12, %r11 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %r11, %r12 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrd $24, %k0, %k0 +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %r10b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $11, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $13, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: kmovd %k0, %r10d +; AVX512BW-NEXT: shlq $15, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %r8b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $16, %r8 +; AVX512BW-NEXT: orq %r12, %r8 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $17, %r12 +; AVX512BW-NEXT: orq %r8, %r12 +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $18, %r8 +; AVX512BW-NEXT: orq %r12, %r8 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $19, %r12 +; AVX512BW-NEXT: orq %r8, %r12 +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %dil, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $21, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $23, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: shlq $25, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movzbl %dl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $26, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $27, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $28, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $29, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $31, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $33, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: shlq $35, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r13, %rdx +; AVX512BW-NEXT: shlq $37, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r13, %rdx +; AVX512BW-NEXT: shlq $39, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $40, %r13 +; AVX512BW-NEXT: orq %rdx, %r13 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $41, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %rdx +; AVX512BW-NEXT: shlq $42, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $43, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r15, %rdx +; AVX512BW-NEXT: shlq $44, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $45, %r15 +; AVX512BW-NEXT: orq %rdx, %r15 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r14, %rdx +; AVX512BW-NEXT: shlq $47, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r14, %rdx +; AVX512BW-NEXT: shlq $49, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $50, %r14 +; AVX512BW-NEXT: orq %rdx, %r14 +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $51, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r11, %rdx +; AVX512BW-NEXT: shlq $52, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $53, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r11, %rdx +; AVX512BW-NEXT: shlq $54, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $55, %r11 +; AVX512BW-NEXT: orq %rdx, %r11 +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r10, %rdx +; AVX512BW-NEXT: shlq $57, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r10, %rdx +; AVX512BW-NEXT: shlq $59, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $60, %r10 +; AVX512BW-NEXT: orq %rdx, %r10 +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $61, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: shlq $62, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: movzbl %cl, %eax +; AVX512BW-NEXT: shlq $63, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %rax, 8(%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <32 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <160 x i32> + store <160 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor5_vf64(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor5_vf64: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r13 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k0 +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k3 +; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k2 +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %edx +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %edi +; AVX512F-ONLY-NEXT: kshiftrw $15, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $9, %k2, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $4, %k2, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k1, %r15d +; AVX512F-ONLY-NEXT: movzbl %r15b, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: leaq (%r15,%r15,2), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%r15,4), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%r15,8), %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %r11b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r11 +; AVX512F-ONLY-NEXT: shlq $5, %r11 +; AVX512F-ONLY-NEXT: orq %r15, %r11 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $6, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $7, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $8, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $9, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $10, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $11, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $12, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $13, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $14, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $15, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $16, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $17, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $18, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $19, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $20, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $21, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $22, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $23, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $24, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $25, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $26, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $27, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $28, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $29, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %r10b, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $30, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $31, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r13 +; AVX512F-ONLY-NEXT: shlq $33, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $34, %r10 +; AVX512F-ONLY-NEXT: orq %r13, %r10 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $36, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $37, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $38, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $39, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r13 +; AVX512F-ONLY-NEXT: shlq $43, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $44, %r10 +; AVX512F-ONLY-NEXT: orq %r13, %r10 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $45, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $46, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $48, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $49, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r13 +; AVX512F-ONLY-NEXT: shlq $53, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $54, %r10 +; AVX512F-ONLY-NEXT: orq %r13, %r10 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $56, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $58, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $59, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: movzbl %r15b, %r10d +; AVX512F-ONLY-NEXT: movl %r15d, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $60, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $62, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $63, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: orq %r11, %r12 +; AVX512F-ONLY-NEXT: movq %r12, (%rsi) +; AVX512F-ONLY-NEXT: movzbl %r15b, %r11d +; AVX512F-ONLY-NEXT: movl %r11d, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,8), %rax +; AVX512F-ONLY-NEXT: movzbl %r14b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $4, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $5, %r14 +; AVX512F-ONLY-NEXT: orq %r15, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $7, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $6, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $8, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $9, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $10, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $11, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $12, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $7, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $13, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $15, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $17, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $8, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $18, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $19, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $20, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $21, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $22, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $15, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $23, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $24, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $25, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $26, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $27, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $28, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %bl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $29, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $30, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %r13 +; AVX512F-ONLY-NEXT: shlq $32, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $33, %rbx +; AVX512F-ONLY-NEXT: orq %r13, %rbx +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rbx +; AVX512F-ONLY-NEXT: shlq $35, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $37, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $12, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $38, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $39, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r12 +; AVX512F-ONLY-NEXT: shlq $40, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $41, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r13 +; AVX512F-ONLY-NEXT: shlq $42, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $43, %rbx +; AVX512F-ONLY-NEXT: orq %r13, %rbx +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rbx +; AVX512F-ONLY-NEXT: shlq $45, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rbx +; AVX512F-ONLY-NEXT: shlq $47, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $48, %r12 +; AVX512F-ONLY-NEXT: orq %rbx, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $49, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rbx +; AVX512F-ONLY-NEXT: shlq $50, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $51, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $52, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $9, %k0, %k4 +; AVX512F-ONLY-NEXT: shlq $53, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r12 +; AVX512F-ONLY-NEXT: shlq $55, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r12 +; AVX512F-ONLY-NEXT: shlq $57, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $58, %rbx +; AVX512F-ONLY-NEXT: orq %r12, %rbx +; AVX512F-ONLY-NEXT: movzbl %r15b, %eax +; AVX512F-ONLY-NEXT: # kill: def $r15d killed $r15d def $r15 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $59, %r12 +; AVX512F-ONLY-NEXT: orq %rbx, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rbx +; AVX512F-ONLY-NEXT: shlq $60, %rbx +; AVX512F-ONLY-NEXT: orq %r12, %rbx +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %rbx, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $10, %k0, %k4 +; AVX512F-ONLY-NEXT: shlq $62, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k0, %k4 +; AVX512F-ONLY-NEXT: shlq $63, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %rax, 32(%rsi) +; AVX512F-ONLY-NEXT: movzbl %r12b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movzbl %bl, %r14d +; AVX512F-ONLY-NEXT: movl %r14d, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: leaq (%r15,%r15,2), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,8), %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $5, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $7, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $9, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $13, %k0, %k4 +; AVX512F-ONLY-NEXT: shlq $11, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $13, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $15, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $14, %k0, %k4 +; AVX512F-ONLY-NEXT: shlq $16, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $17, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $18, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $19, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $20, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $15, %k0, %k4 +; AVX512F-ONLY-NEXT: shlq $21, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $22, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $23, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $24, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $25, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $1, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $26, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $27, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $28, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $29, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $30, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $31, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $33, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $35, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $36, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $37, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $38, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $39, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $40, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $43, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $45, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $46, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $48, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $49, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $50, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $53, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $55, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $56, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $58, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $59, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $60, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $8, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: # kill: def $eax killed $eax def $rax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: shlq $62, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $63, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $7, %k3, %k4 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $9, %k3, %k4 +; AVX512F-ONLY-NEXT: movq %r12, 16(%rsi) +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: leaq (%r13,%r13,2), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%r13,4), %r15 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: leaq (%r15,%r13,8), %r15 +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $5, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $6, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $9, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $11, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $10, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $12, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $13, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $14, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $15, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $16, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $11, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $17, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $19, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $20, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $21, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $12, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $22, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $24, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $25, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $26, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $13, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $27, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $28, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $29, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $30, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $31, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $32, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $34, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $36, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $15, %k3, %k3 +; AVX512F-ONLY-NEXT: shlq $37, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $39, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $41, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $1, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $42, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $43, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $44, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $45, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $46, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $47, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $49, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $51, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k3, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k2, %k2 +; AVX512F-ONLY-NEXT: shlq $52, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $53, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $54, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $56, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-ONLY-NEXT: shlq $57, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $58, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $59, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $60, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $61, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $62, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k1, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $63, %r11 +; AVX512F-ONLY-NEXT: orq %r12, %r11 +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k1 +; AVX512F-ONLY-NEXT: orq %r15, %r11 +; AVX512F-ONLY-NEXT: kmovw %k1, %r15d +; AVX512F-ONLY-NEXT: movq %r11, 24(%rsi) +; AVX512F-ONLY-NEXT: movzbl %bpl, %r11d +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: leaq (%r10,%r11,2), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%r11,4), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%r11,8), %rax +; AVX512F-ONLY-NEXT: movq %r11, %r10 +; AVX512F-ONLY-NEXT: shlq $4, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $5, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movzbl %r9b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $6, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $7, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $8, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %rbp +; AVX512F-ONLY-NEXT: shlq $9, %rbp +; AVX512F-ONLY-NEXT: orq %r9, %rbp +; AVX512F-ONLY-NEXT: kmovw %k0, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k0, %k0 +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %rbp, %rax +; AVX512F-ONLY-NEXT: movzbl %r8b, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $11, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $13, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: kmovw %k0, %r8d +; AVX512F-ONLY-NEXT: shlq $15, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movzbl %r10b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $16, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $17, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $18, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $19, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: shlq $20, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movzbl %dil, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r9 +; AVX512F-ONLY-NEXT: shlq $21, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $22, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %r9 +; AVX512F-ONLY-NEXT: shlq $23, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $24, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $25, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: movzbl %dl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $26, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $27, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $28, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $29, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $30, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movzbl %cl, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $31, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $33, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $37, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $39, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $40, %r13 +; AVX512F-ONLY-NEXT: orq %rcx, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $41, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $42, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $43, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $44, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $45, %r12 +; AVX512F-ONLY-NEXT: orq %rcx, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $47, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $49, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $50, %r15 +; AVX512F-ONLY-NEXT: orq %rcx, %r15 +; AVX512F-ONLY-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $51, %rcx +; AVX512F-ONLY-NEXT: orq %r15, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $52, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $53, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $54, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %rcx +; AVX512F-ONLY-NEXT: shlq $56, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r8, %rcx +; AVX512F-ONLY-NEXT: shlq $58, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $59, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $60, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $61, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $62, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $63, %r14 +; AVX512F-ONLY-NEXT: orq %rbx, %r14 +; AVX512F-ONLY-NEXT: orq %r11, %r14 +; AVX512F-ONLY-NEXT: movq %r14, 8(%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r13 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor5_vf64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k1 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k3 +; AVX512DQ-NEXT: kmovw 6(%rdi), %k2 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ecx +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %edx +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %edi +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r8d +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r9d +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $9, %k2, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ebx +; AVX512DQ-NEXT: kshiftrw $4, %k2, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r14d +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r10d +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r11d +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k1, %r15d +; AVX512DQ-NEXT: movzbl %r15b, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: leaq (%r15,%r15,2), %r12 +; AVX512DQ-NEXT: leaq (%r12,%r15,4), %r12 +; AVX512DQ-NEXT: leaq (%r12,%r15,8), %r12 +; AVX512DQ-NEXT: shlq $4, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %r11b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r11 +; AVX512DQ-NEXT: shlq $5, %r11 +; AVX512DQ-NEXT: orq %r15, %r11 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $6, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $7, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $8, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQ-NEXT: shlq $9, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %r13b, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $10, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $11, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $12, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $13, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQ-NEXT: shlq $14, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %r13b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $15, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $16, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $17, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $18, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQ-NEXT: shlq $19, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %r13b, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $20, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $21, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $22, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $23, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQ-NEXT: shlq $24, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %r13b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $25, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $26, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $27, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $28, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: shlq $29, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %r10b, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $30, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQ-NEXT: shlq $31, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r10, %r13 +; AVX512DQ-NEXT: shlq $33, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQ-NEXT: shlq $34, %r10 +; AVX512DQ-NEXT: orq %r13, %r10 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $36, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $37, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $38, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r10d +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQ-NEXT: shlq $39, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r13 +; AVX512DQ-NEXT: shlq $43, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k4 +; AVX512DQ-NEXT: shlq $44, %r10 +; AVX512DQ-NEXT: orq %r13, %r10 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $45, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $46, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $48, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r10d +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQ-NEXT: shlq $49, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r13 +; AVX512DQ-NEXT: shlq $53, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $3, %k2, %k4 +; AVX512DQ-NEXT: shlq $54, %r10 +; AVX512DQ-NEXT: orq %r13, %r10 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $56, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $58, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: shlq $59, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: movzbl %r15b, %r10d +; AVX512DQ-NEXT: movl %r15d, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $60, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $5, %k2, %k4 +; AVX512DQ-NEXT: shlq $62, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $63, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: orq %r11, %r12 +; AVX512DQ-NEXT: movq %r12, (%rsi) +; AVX512DQ-NEXT: movzbl %r15b, %r11d +; AVX512DQ-NEXT: movl %r11d, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %r15 +; AVX512DQ-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512DQ-NEXT: leaq (%r15,%rax,8), %rax +; AVX512DQ-NEXT: movzbl %r14b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $4, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $5, %r14 +; AVX512DQ-NEXT: orq %r15, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $7, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $6, %k2, %k4 +; AVX512DQ-NEXT: shlq $8, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %al, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $9, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $10, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $11, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $12, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $7, %k2, %k4 +; AVX512DQ-NEXT: shlq $13, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $15, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $17, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $8, %k2, %k4 +; AVX512DQ-NEXT: shlq $18, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %al, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $19, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $20, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $21, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $22, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $15, %k2, %k4 +; AVX512DQ-NEXT: shlq $23, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $24, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $25, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $26, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $27, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: shlq $28, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %bl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $29, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $30, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $10, %k2, %k4 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %r13 +; AVX512DQ-NEXT: shlq $32, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $11, %k2, %k4 +; AVX512DQ-NEXT: shlq $33, %rbx +; AVX512DQ-NEXT: orq %r13, %rbx +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r12, %rbx +; AVX512DQ-NEXT: shlq $35, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $37, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %ebx +; AVX512DQ-NEXT: kshiftrw $12, %k2, %k4 +; AVX512DQ-NEXT: shlq $38, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $39, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %rbx, %r12 +; AVX512DQ-NEXT: shlq $40, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $41, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %rbx, %r13 +; AVX512DQ-NEXT: shlq $42, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k4 +; AVX512DQ-NEXT: shlq $43, %rbx +; AVX512DQ-NEXT: orq %r13, %rbx +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r12, %rbx +; AVX512DQ-NEXT: shlq $45, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r12, %rbx +; AVX512DQ-NEXT: shlq $47, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $14, %k2, %k4 +; AVX512DQ-NEXT: shlq $48, %r12 +; AVX512DQ-NEXT: orq %rbx, %r12 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $49, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %rbx +; AVX512DQ-NEXT: shlq $50, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $51, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $52, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %ebx +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k4 +; AVX512DQ-NEXT: shlq $53, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %rbx, %r12 +; AVX512DQ-NEXT: shlq $55, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %rbx, %r12 +; AVX512DQ-NEXT: shlq $57, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $58, %rbx +; AVX512DQ-NEXT: orq %r12, %rbx +; AVX512DQ-NEXT: movzbl %r15b, %eax +; AVX512DQ-NEXT: # kill: def $r15d killed $r15d def $r15 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $59, %r12 +; AVX512DQ-NEXT: orq %rbx, %r12 +; AVX512DQ-NEXT: movq %r15, %rbx +; AVX512DQ-NEXT: shlq $60, %rbx +; AVX512DQ-NEXT: orq %r12, %rbx +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %rbx, %r12 +; AVX512DQ-NEXT: kmovw %k4, %ebx +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k4 +; AVX512DQ-NEXT: shlq $62, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k4 +; AVX512DQ-NEXT: shlq $63, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %rax, 32(%rsi) +; AVX512DQ-NEXT: movzbl %r12b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movzbl %bl, %r14d +; AVX512DQ-NEXT: movl %r14d, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: leaq (%r15,%r15,2), %r15 +; AVX512DQ-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512DQ-NEXT: leaq (%r15,%rax,8), %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $5, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $7, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $9, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k4 +; AVX512DQ-NEXT: shlq $11, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $13, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $15, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k4 +; AVX512DQ-NEXT: shlq $16, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $17, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $18, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $19, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $20, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k4 +; AVX512DQ-NEXT: shlq $21, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $22, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $23, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $24, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $25, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $1, %k3, %k4 +; AVX512DQ-NEXT: shlq $26, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $27, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $28, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $29, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $30, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: shlq $31, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k3, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $33, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $35, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 +; AVX512DQ-NEXT: shlq $36, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $37, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $38, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $39, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $40, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $43, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $45, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQ-NEXT: shlq $46, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $48, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $49, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $50, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $53, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $55, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQ-NEXT: shlq $56, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $58, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $59, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $60, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: # kill: def $eax killed $eax def $rax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: shlq $62, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $63, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQ-NEXT: movq %r12, 16(%rsi) +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: leaq (%r13,%r13,2), %r15 +; AVX512DQ-NEXT: leaq (%r15,%r13,4), %r15 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: leaq (%r15,%r13,8), %r15 +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $5, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $6, %r12 +; AVX512DQ-NEXT: shlq $7, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $9, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $11, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQ-NEXT: shlq $12, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $13, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $14, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $15, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $16, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQ-NEXT: shlq $17, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $19, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $20, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $21, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQ-NEXT: shlq $22, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $24, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $25, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $26, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQ-NEXT: shlq $27, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $28, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $29, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $30, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $31, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQ-NEXT: shlq $32, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $34, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $36, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $15, %k3, %k3 +; AVX512DQ-NEXT: shlq $37, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $39, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $41, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $1, %k2, %k3 +; AVX512DQ-NEXT: shlq $42, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $43, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $44, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $45, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $46, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $47, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $49, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $51, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k3, %r13d +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQ-NEXT: shlq $52, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $53, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $54, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $56, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k1 +; AVX512DQ-NEXT: shlq $57, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $58, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $59, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $60, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $61, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512DQ-NEXT: shlq $62, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k1, %r13d +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k1 +; AVX512DQ-NEXT: shlq $63, %r11 +; AVX512DQ-NEXT: orq %r12, %r11 +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512DQ-NEXT: orq %r15, %r11 +; AVX512DQ-NEXT: kmovw %k1, %r15d +; AVX512DQ-NEXT: movq %r11, 24(%rsi) +; AVX512DQ-NEXT: movzbl %bpl, %r11d +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: leaq (%r10,%r11,2), %rax +; AVX512DQ-NEXT: leaq (%rax,%r11,4), %rax +; AVX512DQ-NEXT: leaq (%rax,%r11,8), %rax +; AVX512DQ-NEXT: movq %r11, %r10 +; AVX512DQ-NEXT: shlq $4, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: shlq $5, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movzbl %r9b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $6, %r9 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $7, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $8, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %rax, %rbp +; AVX512DQ-NEXT: shlq $9, %rbp +; AVX512DQ-NEXT: orq %r9, %rbp +; AVX512DQ-NEXT: kmovw %k0, %r10d +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %rbp, %rax +; AVX512DQ-NEXT: movzbl %r8b, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %r8 +; AVX512DQ-NEXT: shlq $11, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %r9, %r8 +; AVX512DQ-NEXT: shlq $13, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: kmovw %k0, %r8d +; AVX512DQ-NEXT: shlq $15, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movzbl %r10b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $16, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $17, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $18, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $19, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: shlq $20, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movzbl %dil, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r9 +; AVX512DQ-NEXT: shlq $21, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $22, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %rdi, %r9 +; AVX512DQ-NEXT: shlq $23, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $24, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: shlq $25, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: movzbl %dl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $26, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $27, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $28, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $29, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: shlq $30, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movzbl %cl, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $31, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $33, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: shlq $35, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $37, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $39, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $40, %r13 +; AVX512DQ-NEXT: orq %rcx, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $41, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $42, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $43, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $44, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $45, %r12 +; AVX512DQ-NEXT: orq %rcx, %r12 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $47, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $49, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $50, %r15 +; AVX512DQ-NEXT: orq %rcx, %r15 +; AVX512DQ-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $51, %rcx +; AVX512DQ-NEXT: orq %r15, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $52, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $53, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $54, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %rcx +; AVX512DQ-NEXT: shlq $56, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r8, %rcx +; AVX512DQ-NEXT: shlq $58, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $59, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $60, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $61, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: shlq $62, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: shlq $63, %r14 +; AVX512DQ-NEXT: orq %rbx, %r14 +; AVX512DQ-NEXT: orq %r11, %r14 +; AVX512DQ-NEXT: movq %r14, 8(%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor5_vf64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: kshiftrq $19, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrq $18, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrq $17, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrq $16, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrq $15, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrq $14, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $57, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrq $52, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrq $1, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrq $2, %k0, %k1 +; AVX512BW-NEXT: kmovd %k0, %r15d +; AVX512BW-NEXT: movzbl %r15b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: leaq (%r15,%r15,2), %r12 +; AVX512BW-NEXT: leaq (%r12,%r15,4), %r12 +; AVX512BW-NEXT: leaq (%r12,%r15,8), %r12 +; AVX512BW-NEXT: shlq $4, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %r11b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r11 +; AVX512BW-NEXT: shlq $5, %r11 +; AVX512BW-NEXT: orq %r15, %r11 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $6, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $7, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $8, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $3, %k0, %k1 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %r13b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $10, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $11, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $12, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $13, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $4, %k0, %k1 +; AVX512BW-NEXT: shlq $14, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %r13b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $15, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $16, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $17, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $18, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $5, %k0, %k1 +; AVX512BW-NEXT: shlq $19, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %r13b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $20, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $21, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $22, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $23, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $12, %k0, %k1 +; AVX512BW-NEXT: shlq $24, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %r13b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $25, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $26, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $27, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $28, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: shlq $29, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %r10b, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $7, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r10, %r13 +; AVX512BW-NEXT: shlq $33, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $8, %k0, %k1 +; AVX512BW-NEXT: shlq $34, %r10 +; AVX512BW-NEXT: orq %r13, %r10 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $36, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $37, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $38, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrq $9, %k0, %k1 +; AVX512BW-NEXT: shlq $39, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r10, %r13 +; AVX512BW-NEXT: shlq $43, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $10, %k0, %k1 +; AVX512BW-NEXT: shlq $44, %r10 +; AVX512BW-NEXT: orq %r13, %r10 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $45, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $46, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $47, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $48, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrq $11, %k0, %k1 +; AVX512BW-NEXT: shlq $49, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $51, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $53, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $51, %k0, %k1 +; AVX512BW-NEXT: shlq $54, %r10 +; AVX512BW-NEXT: orq %r12, %r10 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $55, %r12 +; AVX512BW-NEXT: orq %r10, %r12 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: shlq $56, %r10 +; AVX512BW-NEXT: orq %r12, %r10 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $57, %r12 +; AVX512BW-NEXT: orq %r10, %r12 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: shlq $58, %r10 +; AVX512BW-NEXT: orq %r12, %r10 +; AVX512BW-NEXT: shlq $59, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movl %r15d, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $60, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $61, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 +; AVX512BW-NEXT: movzbl %r15b, %r10d +; AVX512BW-NEXT: shlq $62, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $63, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %rax, (%rsi) +; AVX512BW-NEXT: movzbl %r13b, %r11d +; AVX512BW-NEXT: movl %r11d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r15 +; AVX512BW-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512BW-NEXT: leaq (%r15,%rax,8), %rax +; AVX512BW-NEXT: movzbl %r14b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $4, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $5, %r14 +; AVX512BW-NEXT: orq %r15, %r14 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $7, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $54, %k0, %k1 +; AVX512BW-NEXT: shlq $8, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %al, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $9, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $10, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $11, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $12, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $55, %k0, %k1 +; AVX512BW-NEXT: shlq $13, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $15, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $17, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: shlq $18, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %al, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $19, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $20, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $21, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $22, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $63, %k0, %k1 +; AVX512BW-NEXT: shlq $23, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $25, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $27, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: shlq $28, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %bl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: shlq $29, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $30, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $32, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 +; AVX512BW-NEXT: shlq $33, %rbx +; AVX512BW-NEXT: orq %r13, %rbx +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: shlq $35, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $37, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 +; AVX512BW-NEXT: shlq $38, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $39, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbx, %r12 +; AVX512BW-NEXT: shlq $40, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $41, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $42, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $61, %k0, %k1 +; AVX512BW-NEXT: shlq $43, %rbx +; AVX512BW-NEXT: orq %r13, %rbx +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: shlq $45, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $47, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrq $62, %k0, %k1 +; AVX512BW-NEXT: shlq $48, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $49, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbx, %r12 +; AVX512BW-NEXT: shlq $50, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $51, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbx, %r12 +; AVX512BW-NEXT: shlq $52, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $25, %k0, %k1 +; AVX512BW-NEXT: shlq $53, %rbx +; AVX512BW-NEXT: orq %r12, %rbx +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $54, %r12 +; AVX512BW-NEXT: orq %rbx, %r12 +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: shlq $55, %rbx +; AVX512BW-NEXT: orq %r12, %rbx +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $56, %r12 +; AVX512BW-NEXT: orq %rbx, %r12 +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: shlq $57, %rbx +; AVX512BW-NEXT: orq %r12, %rbx +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movl %r15d, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: shlq $59, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrq $26, %k0, %k1 +; AVX512BW-NEXT: shlq $62, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $28, %k0, %k1 +; AVX512BW-NEXT: movzbl %r15b, %r15d +; AVX512BW-NEXT: shlq $63, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: movq %r15, 32(%rsi) +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movzbl %bl, %r14d +; AVX512BW-NEXT: movl %r14d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %rax +; AVX512BW-NEXT: leaq (%rax,%r13,4), %rax +; AVX512BW-NEXT: leaq (%rax,%r13,8), %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $4, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $5, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: shlq $6, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $7, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $8, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $9, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $10, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $29, %k0, %k1 +; AVX512BW-NEXT: shlq $11, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $13, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $15, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $30, %k0, %k1 +; AVX512BW-NEXT: shlq $16, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $17, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $18, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $19, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $20, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $31, %k0, %k1 +; AVX512BW-NEXT: shlq $21, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $23, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $25, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $32, %k0, %k1 +; AVX512BW-NEXT: shlq $26, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $27, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $28, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $29, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %rbp +; AVX512BW-NEXT: shlq $30, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $33, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %r12 +; AVX512BW-NEXT: orq %rbp, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $33, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $35, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $34, %k0, %k1 +; AVX512BW-NEXT: shlq $36, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $37, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $38, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $39, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %rbp +; AVX512BW-NEXT: shlq $40, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $35, %k0, %k1 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %rbp, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $43, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $45, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $36, %k0, %k1 +; AVX512BW-NEXT: shlq $46, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $47, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $48, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $49, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %rbp +; AVX512BW-NEXT: shlq $50, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $37, %k0, %k1 +; AVX512BW-NEXT: shlq $51, %r12 +; AVX512BW-NEXT: orq %rbp, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $53, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $55, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $38, %k0, %k1 +; AVX512BW-NEXT: shlq $56, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $57, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $58, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $59, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $60, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $40, %k0, %k1 +; AVX512BW-NEXT: shlq $61, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: # kill: def $eax killed $eax def $rax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: shlq $62, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $63, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $39, %k0, %k1 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $41, %k0, %k1 +; AVX512BW-NEXT: movq %r12, 16(%rsi) +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: leaq (%r13,%r13,2), %r15 +; AVX512BW-NEXT: leaq (%r15,%r13,4), %r15 +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: leaq (%r15,%r13,8), %r15 +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $4, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $5, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $6, %r12 +; AVX512BW-NEXT: shlq $7, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $9, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $42, %k0, %k1 +; AVX512BW-NEXT: shlq $12, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $13, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $14, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $15, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $16, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 +; AVX512BW-NEXT: shlq $17, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $19, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $21, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $44, %k0, %k1 +; AVX512BW-NEXT: shlq $22, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $23, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $24, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $25, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $26, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $45, %k0, %k1 +; AVX512BW-NEXT: shlq $27, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $28, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $29, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %rbp +; AVX512BW-NEXT: shlq $31, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $46, %k0, %k1 +; AVX512BW-NEXT: shlq $32, %r12 +; AVX512BW-NEXT: orq %rbp, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $34, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $36, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $47, %k0, %k1 +; AVX512BW-NEXT: shlq $37, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $39, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $41, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 +; AVX512BW-NEXT: shlq $42, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $43, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $44, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $45, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $46, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $49, %k0, %k1 +; AVX512BW-NEXT: shlq $47, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $49, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %rbp +; AVX512BW-NEXT: shlq $51, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $50, %k0, %k1 +; AVX512BW-NEXT: shlq $52, %r12 +; AVX512BW-NEXT: orq %rbp, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $53, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $54, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $55, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $56, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $13, %k0, %k1 +; AVX512BW-NEXT: shlq $57, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $59, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $20, %k0, %k1 +; AVX512BW-NEXT: shlq $62, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $21, %k0, %k1 +; AVX512BW-NEXT: shlq $63, %r11 +; AVX512BW-NEXT: orq %r12, %r11 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $22, %k0, %k1 +; AVX512BW-NEXT: orq %r15, %r11 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $23, %k0, %k1 +; AVX512BW-NEXT: movq %r11, 24(%rsi) +; AVX512BW-NEXT: movzbl %bpl, %r11d +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: leaq (%r10,%r11,2), %rax +; AVX512BW-NEXT: leaq (%rax,%r11,4), %rax +; AVX512BW-NEXT: leaq (%rax,%r11,8), %rax +; AVX512BW-NEXT: movq %r11, %r10 +; AVX512BW-NEXT: shlq $4, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: shlq $5, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movzbl %r9b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $6, %r9 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: shlq $7, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $8, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: shlq $9, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrq $24, %k0, %k0 +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movzbl %r8b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %r8 +; AVX512BW-NEXT: shlq $11, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %r10, %r8 +; AVX512BW-NEXT: shlq $13, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: kmovd %k0, %r8d +; AVX512BW-NEXT: shlq $15, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movzbl %dil, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $16, %rdi +; AVX512BW-NEXT: orq %r10, %rdi +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: shlq $17, %r10 +; AVX512BW-NEXT: orq %rdi, %r10 +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $18, %rdi +; AVX512BW-NEXT: orq %r10, %rdi +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: shlq $19, %r10 +; AVX512BW-NEXT: orq %rdi, %r10 +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movzbl %dl, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $21, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $23, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: shlq $25, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movzbl %cl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $26, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $27, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $28, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $29, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $31, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $33, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: shlq $35, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $37, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $39, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $40, %r13 +; AVX512BW-NEXT: orq %rcx, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $41, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $42, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $43, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $44, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $45, %r12 +; AVX512BW-NEXT: orq %rcx, %r12 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $47, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $49, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $50, %r15 +; AVX512BW-NEXT: orq %rcx, %r15 +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $51, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r9, %rcx +; AVX512BW-NEXT: shlq $52, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $53, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r9, %rcx +; AVX512BW-NEXT: shlq $54, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $55, %r9 +; AVX512BW-NEXT: orq %rcx, %r9 +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r8, %rcx +; AVX512BW-NEXT: shlq $57, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r8, %rcx +; AVX512BW-NEXT: shlq $59, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $60, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $61, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: shlq $62, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: shlq $63, %r14 +; AVX512BW-NEXT: orq %rbx, %r14 +; AVX512BW-NEXT: orq %r11, %r14 +; AVX512BW-NEXT: movq %r14, 8(%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <64 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <320 x i32> + store <320 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor6_vf2(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor6_vf2: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k0 +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rax,2), %ecx +; AVX512F-ONLY-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512F-ONLY-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512F-ONLY-NEXT: movl %eax, %edx +; AVX512F-ONLY-NEXT: shll $4, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $5, %eax +; AVX512F-ONLY-NEXT: orl %edx, %eax +; AVX512F-ONLY-NEXT: kmovw %k0, %ecx +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $6, %edi +; AVX512F-ONLY-NEXT: movl %edx, %r8d +; AVX512F-ONLY-NEXT: shll $7, %r8d +; AVX512F-ONLY-NEXT: orl %edi, %r8d +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $8, %edi +; AVX512F-ONLY-NEXT: orl %r8d, %edi +; AVX512F-ONLY-NEXT: movl %edx, %r8d +; AVX512F-ONLY-NEXT: shll $9, %r8d +; AVX512F-ONLY-NEXT: orl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $10, %edx +; AVX512F-ONLY-NEXT: orl %r8d, %edx +; AVX512F-ONLY-NEXT: shll $11, %ecx +; AVX512F-ONLY-NEXT: orl %edx, %ecx +; AVX512F-ONLY-NEXT: orl %eax, %ecx +; AVX512F-ONLY-NEXT: andl $4095, %ecx # imm = 0xFFF +; AVX512F-ONLY-NEXT: movw %cx, (%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor6_vf2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k1 +; AVX512DQ-NEXT: kshiftrb $1, %k1, %k0 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leal (%rax,%rax,2), %ecx +; AVX512DQ-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512DQ-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512DQ-NEXT: movl %eax, %edx +; AVX512DQ-NEXT: shll $4, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: shll $5, %eax +; AVX512DQ-NEXT: orl %edx, %eax +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $6, %edi +; AVX512DQ-NEXT: movl %edx, %r8d +; AVX512DQ-NEXT: shll $7, %r8d +; AVX512DQ-NEXT: orl %edi, %r8d +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $8, %edi +; AVX512DQ-NEXT: orl %r8d, %edi +; AVX512DQ-NEXT: movl %edx, %r8d +; AVX512DQ-NEXT: shll $9, %r8d +; AVX512DQ-NEXT: orl %edi, %r8d +; AVX512DQ-NEXT: shll $10, %edx +; AVX512DQ-NEXT: orl %r8d, %edx +; AVX512DQ-NEXT: shll $11, %ecx +; AVX512DQ-NEXT: orl %edx, %ecx +; AVX512DQ-NEXT: orl %eax, %ecx +; AVX512DQ-NEXT: andl $4095, %ecx # imm = 0xFFF +; AVX512DQ-NEXT: movw %cx, (%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor6_vf2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k0 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leal (%rax,%rax,2), %ecx +; AVX512BW-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512BW-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512BW-NEXT: movl %eax, %edx +; AVX512BW-NEXT: shll $4, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: shll $5, %eax +; AVX512BW-NEXT: orl %edx, %eax +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $6, %edi +; AVX512BW-NEXT: movl %edx, %r8d +; AVX512BW-NEXT: shll $7, %r8d +; AVX512BW-NEXT: orl %edi, %r8d +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $8, %edi +; AVX512BW-NEXT: orl %r8d, %edi +; AVX512BW-NEXT: movl %edx, %r8d +; AVX512BW-NEXT: shll $9, %r8d +; AVX512BW-NEXT: orl %edi, %r8d +; AVX512BW-NEXT: shll $10, %edx +; AVX512BW-NEXT: orl %r8d, %edx +; AVX512BW-NEXT: shll $11, %ecx +; AVX512BW-NEXT: orl %edx, %ecx +; AVX512BW-NEXT: orl %eax, %ecx +; AVX512BW-NEXT: andl $4095, %ecx # imm = 0xFFF +; AVX512BW-NEXT: movw %cx, (%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <2 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <12 x i32> + store <12 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor6_vf4(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor6_vf4: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3 +; AVX512F-ONLY-NEXT: kshiftrw $3, %k3, %k0 +; AVX512F-ONLY-NEXT: kshiftrw $2, %k3, %k1 +; AVX512F-ONLY-NEXT: kshiftrw $1, %k3, %k2 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rax,2), %ecx +; AVX512F-ONLY-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512F-ONLY-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512F-ONLY-NEXT: movl %eax, %edx +; AVX512F-ONLY-NEXT: shll $4, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $5, %eax +; AVX512F-ONLY-NEXT: orl %edx, %eax +; AVX512F-ONLY-NEXT: kmovw %k2, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $6, %edx +; AVX512F-ONLY-NEXT: movl %ecx, %edi +; AVX512F-ONLY-NEXT: shll $7, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $8, %edx +; AVX512F-ONLY-NEXT: orl %edi, %edx +; AVX512F-ONLY-NEXT: movl %ecx, %edi +; AVX512F-ONLY-NEXT: shll $9, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $10, %edx +; AVX512F-ONLY-NEXT: orl %edi, %edx +; AVX512F-ONLY-NEXT: shll $11, %ecx +; AVX512F-ONLY-NEXT: orl %edx, %ecx +; AVX512F-ONLY-NEXT: kmovw %k1, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $12, %edi +; AVX512F-ONLY-NEXT: orl %ecx, %edi +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $13, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $14, %edi +; AVX512F-ONLY-NEXT: orl %ecx, %edi +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $15, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: orl %eax, %ecx +; AVX512F-ONLY-NEXT: movw %cx, (%rsi) +; AVX512F-ONLY-NEXT: movl %edx, %eax +; AVX512F-ONLY-NEXT: shll $16, %eax +; AVX512F-ONLY-NEXT: shll $17, %edx +; AVX512F-ONLY-NEXT: orl %eax, %edx +; AVX512F-ONLY-NEXT: kmovw %k0, %eax +; AVX512F-ONLY-NEXT: movl %eax, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $18, %r8d +; AVX512F-ONLY-NEXT: orl %edx, %r8d +; AVX512F-ONLY-NEXT: movl %edi, %edx +; AVX512F-ONLY-NEXT: shll $19, %edx +; AVX512F-ONLY-NEXT: orl %r8d, %edx +; AVX512F-ONLY-NEXT: movl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $20, %r8d +; AVX512F-ONLY-NEXT: orl %edx, %r8d +; AVX512F-ONLY-NEXT: movl %edi, %edx +; AVX512F-ONLY-NEXT: shll $21, %edx +; AVX512F-ONLY-NEXT: orl %r8d, %edx +; AVX512F-ONLY-NEXT: shll $22, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: shll $23, %eax +; AVX512F-ONLY-NEXT: orl %edi, %eax +; AVX512F-ONLY-NEXT: orl %ecx, %eax +; AVX512F-ONLY-NEXT: shrl $16, %eax +; AVX512F-ONLY-NEXT: movb %al, 2(%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor6_vf4: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k3 +; AVX512DQ-NEXT: kshiftrb $3, %k3, %k0 +; AVX512DQ-NEXT: kshiftrb $2, %k3, %k1 +; AVX512DQ-NEXT: kshiftrb $1, %k3, %k2 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leal (%rax,%rax,2), %ecx +; AVX512DQ-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512DQ-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512DQ-NEXT: movl %eax, %edx +; AVX512DQ-NEXT: shll $4, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: shll $5, %eax +; AVX512DQ-NEXT: orl %edx, %eax +; AVX512DQ-NEXT: kmovw %k2, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: shll $6, %edx +; AVX512DQ-NEXT: movl %ecx, %edi +; AVX512DQ-NEXT: shll $7, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: shll $8, %edx +; AVX512DQ-NEXT: orl %edi, %edx +; AVX512DQ-NEXT: movl %ecx, %edi +; AVX512DQ-NEXT: shll $9, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: shll $10, %edx +; AVX512DQ-NEXT: orl %edi, %edx +; AVX512DQ-NEXT: shll $11, %ecx +; AVX512DQ-NEXT: orl %edx, %ecx +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $12, %edi +; AVX512DQ-NEXT: orl %ecx, %edi +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $13, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $14, %edi +; AVX512DQ-NEXT: orl %ecx, %edi +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $15, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: orl %eax, %ecx +; AVX512DQ-NEXT: movw %cx, (%rsi) +; AVX512DQ-NEXT: movl %edx, %eax +; AVX512DQ-NEXT: shll $16, %eax +; AVX512DQ-NEXT: shll $17, %edx +; AVX512DQ-NEXT: orl %eax, %edx +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: movl %eax, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movl %edi, %r8d +; AVX512DQ-NEXT: shll $18, %r8d +; AVX512DQ-NEXT: orl %edx, %r8d +; AVX512DQ-NEXT: movl %edi, %edx +; AVX512DQ-NEXT: shll $19, %edx +; AVX512DQ-NEXT: orl %r8d, %edx +; AVX512DQ-NEXT: movl %edi, %r8d +; AVX512DQ-NEXT: shll $20, %r8d +; AVX512DQ-NEXT: orl %edx, %r8d +; AVX512DQ-NEXT: movl %edi, %edx +; AVX512DQ-NEXT: shll $21, %edx +; AVX512DQ-NEXT: orl %r8d, %edx +; AVX512DQ-NEXT: shll $22, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: shll $23, %eax +; AVX512DQ-NEXT: orl %edi, %eax +; AVX512DQ-NEXT: orl %ecx, %eax +; AVX512DQ-NEXT: shrl $16, %eax +; AVX512DQ-NEXT: movb %al, 2(%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor6_vf4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k3 +; AVX512BW-NEXT: kshiftrw $3, %k3, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k3, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k2 +; AVX512BW-NEXT: kmovd %k3, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leal (%rax,%rax,2), %ecx +; AVX512BW-NEXT: leal (%rcx,%rax,4), %ecx +; AVX512BW-NEXT: leal (%rcx,%rax,8), %ecx +; AVX512BW-NEXT: movl %eax, %edx +; AVX512BW-NEXT: shll $4, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: shll $5, %eax +; AVX512BW-NEXT: orl %edx, %eax +; AVX512BW-NEXT: kmovd %k2, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: shll $6, %edx +; AVX512BW-NEXT: movl %ecx, %edi +; AVX512BW-NEXT: shll $7, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: shll $8, %edx +; AVX512BW-NEXT: orl %edi, %edx +; AVX512BW-NEXT: movl %ecx, %edi +; AVX512BW-NEXT: shll $9, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: shll $10, %edx +; AVX512BW-NEXT: orl %edi, %edx +; AVX512BW-NEXT: shll $11, %ecx +; AVX512BW-NEXT: orl %edx, %ecx +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $12, %edi +; AVX512BW-NEXT: orl %ecx, %edi +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $13, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $14, %edi +; AVX512BW-NEXT: orl %ecx, %edi +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $15, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: orl %eax, %ecx +; AVX512BW-NEXT: movw %cx, (%rsi) +; AVX512BW-NEXT: movl %edx, %eax +; AVX512BW-NEXT: shll $16, %eax +; AVX512BW-NEXT: shll $17, %edx +; AVX512BW-NEXT: orl %eax, %edx +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: movl %eax, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movl %edi, %r8d +; AVX512BW-NEXT: shll $18, %r8d +; AVX512BW-NEXT: orl %edx, %r8d +; AVX512BW-NEXT: movl %edi, %edx +; AVX512BW-NEXT: shll $19, %edx +; AVX512BW-NEXT: orl %r8d, %edx +; AVX512BW-NEXT: movl %edi, %r8d +; AVX512BW-NEXT: shll $20, %r8d +; AVX512BW-NEXT: orl %edx, %r8d +; AVX512BW-NEXT: movl %edi, %edx +; AVX512BW-NEXT: shll $21, %edx +; AVX512BW-NEXT: orl %r8d, %edx +; AVX512BW-NEXT: shll $22, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: shll $23, %eax +; AVX512BW-NEXT: orl %edi, %eax +; AVX512BW-NEXT: orl %ecx, %eax +; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: movb %al, 2(%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <4 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <24 x i32> + store <24 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor6_vf8(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor6_vf8: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kshiftrw $7, %k1, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, %eax +; AVX512F-ONLY-NEXT: kshiftrw $6, %k1, %k0 +; AVX512F-ONLY-NEXT: kshiftrw $5, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $4, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %edi +; AVX512F-ONLY-NEXT: kshiftrw $3, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r10d +; AVX512F-ONLY-NEXT: kmovw %k1, %edx +; AVX512F-ONLY-NEXT: movzbl %dl, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: leaq (%rdx,%rdx,2), %r11 +; AVX512F-ONLY-NEXT: leaq (%r11,%rdx,4), %r11 +; AVX512F-ONLY-NEXT: leaq (%r11,%rdx,8), %r11 +; AVX512F-ONLY-NEXT: movq %rdx, %rbx +; AVX512F-ONLY-NEXT: shlq $4, %rbx +; AVX512F-ONLY-NEXT: orq %r11, %rbx +; AVX512F-ONLY-NEXT: shlq $5, %rdx +; AVX512F-ONLY-NEXT: orq %rbx, %rdx +; AVX512F-ONLY-NEXT: movzbl %r10b, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r11 +; AVX512F-ONLY-NEXT: shlq $6, %r11 +; AVX512F-ONLY-NEXT: movq %r10, %rbx +; AVX512F-ONLY-NEXT: shlq $7, %rbx +; AVX512F-ONLY-NEXT: orq %r11, %rbx +; AVX512F-ONLY-NEXT: movq %r10, %r11 +; AVX512F-ONLY-NEXT: shlq $8, %r11 +; AVX512F-ONLY-NEXT: orq %rbx, %r11 +; AVX512F-ONLY-NEXT: movq %r10, %rbx +; AVX512F-ONLY-NEXT: shlq $9, %rbx +; AVX512F-ONLY-NEXT: orq %r11, %rbx +; AVX512F-ONLY-NEXT: movq %r10, %r11 +; AVX512F-ONLY-NEXT: shlq $10, %r11 +; AVX512F-ONLY-NEXT: orq %rbx, %r11 +; AVX512F-ONLY-NEXT: shlq $11, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: movzbl %r9b, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $12, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $13, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $14, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $15, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $16, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: shlq $17, %r9 +; AVX512F-ONLY-NEXT: orq %r11, %r9 +; AVX512F-ONLY-NEXT: movzbl %r8b, %r8d +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %r10 +; AVX512F-ONLY-NEXT: shlq $18, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $19, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %r8, %r10 +; AVX512F-ONLY-NEXT: shlq $20, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $21, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %r8, %r10 +; AVX512F-ONLY-NEXT: shlq $22, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $23, %r8 +; AVX512F-ONLY-NEXT: orq %r10, %r8 +; AVX512F-ONLY-NEXT: movzbl %dil, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %rdi +; AVX512F-ONLY-NEXT: shlq $24, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $25, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %r9, %rdi +; AVX512F-ONLY-NEXT: shlq $26, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $27, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %r9, %rdi +; AVX512F-ONLY-NEXT: shlq $28, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: shlq $29, %r9 +; AVX512F-ONLY-NEXT: orq %rdi, %r9 +; AVX512F-ONLY-NEXT: movzbl %cl, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $30, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $31, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movl %edi, (%rsi) +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $32, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $33, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $34, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: shlq $35, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: kmovw %k0, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $36, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: movq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $37, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: movq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $38, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: movq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $39, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: movq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $40, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $41, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: movzbl %al, %ecx +; AVX512F-ONLY-NEXT: # kill: def $eax killed $eax def $rax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $42, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $43, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $44, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $45, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: shrq $32, %rcx +; AVX512F-ONLY-NEXT: movw %cx, 4(%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor6_vf8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovb (%rdi), %k1 +; AVX512DQ-NEXT: kshiftrb $7, %k1, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: kshiftrb $6, %k1, %k0 +; AVX512DQ-NEXT: kshiftrb $5, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %ecx +; AVX512DQ-NEXT: kshiftrb $4, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %edi +; AVX512DQ-NEXT: kshiftrb $3, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r8d +; AVX512DQ-NEXT: kshiftrb $2, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r9d +; AVX512DQ-NEXT: kshiftrb $1, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r10d +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: movzbl %dl, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: leaq (%rdx,%rdx,2), %r11 +; AVX512DQ-NEXT: leaq (%r11,%rdx,4), %r11 +; AVX512DQ-NEXT: leaq (%r11,%rdx,8), %r11 +; AVX512DQ-NEXT: movq %rdx, %rbx +; AVX512DQ-NEXT: shlq $4, %rbx +; AVX512DQ-NEXT: orq %r11, %rbx +; AVX512DQ-NEXT: shlq $5, %rdx +; AVX512DQ-NEXT: orq %rbx, %rdx +; AVX512DQ-NEXT: movzbl %r10b, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %r11 +; AVX512DQ-NEXT: shlq $6, %r11 +; AVX512DQ-NEXT: movq %r10, %rbx +; AVX512DQ-NEXT: shlq $7, %rbx +; AVX512DQ-NEXT: orq %r11, %rbx +; AVX512DQ-NEXT: movq %r10, %r11 +; AVX512DQ-NEXT: shlq $8, %r11 +; AVX512DQ-NEXT: orq %rbx, %r11 +; AVX512DQ-NEXT: movq %r10, %rbx +; AVX512DQ-NEXT: shlq $9, %rbx +; AVX512DQ-NEXT: orq %r11, %rbx +; AVX512DQ-NEXT: movq %r10, %r11 +; AVX512DQ-NEXT: shlq $10, %r11 +; AVX512DQ-NEXT: orq %rbx, %r11 +; AVX512DQ-NEXT: shlq $11, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: movzbl %r9b, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $12, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $13, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $14, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $15, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $16, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: shlq $17, %r9 +; AVX512DQ-NEXT: orq %r11, %r9 +; AVX512DQ-NEXT: movzbl %r8b, %r8d +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %r10 +; AVX512DQ-NEXT: shlq $18, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $19, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %r8, %r10 +; AVX512DQ-NEXT: shlq $20, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $21, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %r8, %r10 +; AVX512DQ-NEXT: shlq $22, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: shlq $23, %r8 +; AVX512DQ-NEXT: orq %r10, %r8 +; AVX512DQ-NEXT: movzbl %dil, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %rdi +; AVX512DQ-NEXT: shlq $24, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %r9, %r8 +; AVX512DQ-NEXT: shlq $25, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %r9, %rdi +; AVX512DQ-NEXT: shlq $26, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %r9, %r8 +; AVX512DQ-NEXT: shlq $27, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %r9, %rdi +; AVX512DQ-NEXT: shlq $28, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: shlq $29, %r9 +; AVX512DQ-NEXT: orq %rdi, %r9 +; AVX512DQ-NEXT: movzbl %cl, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $30, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: shlq $31, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movl %edi, (%rsi) +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $32, %rdx +; AVX512DQ-NEXT: movq %rcx, %r8 +; AVX512DQ-NEXT: shlq $33, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $34, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: shlq $35, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: kmovw %k0, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %r8 +; AVX512DQ-NEXT: shlq $36, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: movq %rdx, %rcx +; AVX512DQ-NEXT: shlq $37, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: movq %rdx, %r8 +; AVX512DQ-NEXT: shlq $38, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: movq %rdx, %rcx +; AVX512DQ-NEXT: shlq $39, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: movq %rdx, %r8 +; AVX512DQ-NEXT: shlq $40, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: shlq $41, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: movzbl %al, %ecx +; AVX512DQ-NEXT: # kill: def $eax killed $eax def $rax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $42, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $43, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $44, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $45, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: shlq $47, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: shrq $32, %rcx +; AVX512DQ-NEXT: movw %cx, 4(%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor6_vf8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: kshiftrw $6, %k1, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k2 +; AVX512BW-NEXT: kmovd %k2, %ecx +; AVX512BW-NEXT: kshiftrw $4, %k1, %k2 +; AVX512BW-NEXT: kmovd %k2, %edi +; AVX512BW-NEXT: kshiftrw $3, %k1, %k2 +; AVX512BW-NEXT: kmovd %k2, %r8d +; AVX512BW-NEXT: kshiftrw $2, %k1, %k2 +; AVX512BW-NEXT: kmovd %k2, %r9d +; AVX512BW-NEXT: kshiftrw $1, %k1, %k2 +; AVX512BW-NEXT: kmovd %k2, %r10d +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: movzbl %dl, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: leaq (%rdx,%rdx,2), %r11 +; AVX512BW-NEXT: leaq (%r11,%rdx,4), %r11 +; AVX512BW-NEXT: leaq (%r11,%rdx,8), %r11 +; AVX512BW-NEXT: movq %rdx, %rbx +; AVX512BW-NEXT: shlq $4, %rbx +; AVX512BW-NEXT: orq %r11, %rbx +; AVX512BW-NEXT: shlq $5, %rdx +; AVX512BW-NEXT: orq %rbx, %rdx +; AVX512BW-NEXT: movzbl %r10b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %r11 +; AVX512BW-NEXT: shlq $6, %r11 +; AVX512BW-NEXT: movq %r10, %rbx +; AVX512BW-NEXT: shlq $7, %rbx +; AVX512BW-NEXT: orq %r11, %rbx +; AVX512BW-NEXT: movq %r10, %r11 +; AVX512BW-NEXT: shlq $8, %r11 +; AVX512BW-NEXT: orq %rbx, %r11 +; AVX512BW-NEXT: movq %r10, %rbx +; AVX512BW-NEXT: shlq $9, %rbx +; AVX512BW-NEXT: orq %r11, %rbx +; AVX512BW-NEXT: movq %r10, %r11 +; AVX512BW-NEXT: shlq $10, %r11 +; AVX512BW-NEXT: orq %rbx, %r11 +; AVX512BW-NEXT: shlq $11, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: movzbl %r9b, %r9d +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $12, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $13, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $14, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $15, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $16, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: shlq $17, %r9 +; AVX512BW-NEXT: orq %r11, %r9 +; AVX512BW-NEXT: movzbl %r8b, %r8d +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %r10 +; AVX512BW-NEXT: shlq $18, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $19, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movq %r8, %r10 +; AVX512BW-NEXT: shlq $20, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $21, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movq %r8, %r10 +; AVX512BW-NEXT: shlq $22, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: shlq $23, %r8 +; AVX512BW-NEXT: orq %r10, %r8 +; AVX512BW-NEXT: movzbl %dil, %r9d +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %rdi +; AVX512BW-NEXT: shlq $24, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: movq %r9, %r8 +; AVX512BW-NEXT: shlq $25, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %r9, %rdi +; AVX512BW-NEXT: shlq $26, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: movq %r9, %r8 +; AVX512BW-NEXT: shlq $27, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %r9, %rdi +; AVX512BW-NEXT: shlq $28, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: shlq $29, %r9 +; AVX512BW-NEXT: orq %rdi, %r9 +; AVX512BW-NEXT: movzbl %cl, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $30, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: shlq $31, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movl %edi, (%rsi) +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $32, %rdx +; AVX512BW-NEXT: movq %rcx, %r8 +; AVX512BW-NEXT: shlq $33, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $34, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: shlq $35, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: kmovd %k0, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %r8 +; AVX512BW-NEXT: shlq $36, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: shlq $37, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: movq %rdx, %r8 +; AVX512BW-NEXT: shlq $38, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: shlq $39, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: movq %rdx, %r8 +; AVX512BW-NEXT: shlq $40, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: shlq $41, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: movzbl %al, %ecx +; AVX512BW-NEXT: # kill: def $eax killed $eax def $rax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $42, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $43, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $44, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $45, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: shlq $47, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: shrq $32, %rcx +; AVX512BW-NEXT: movw %cx, 4(%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: retq + %src.vec = load <8 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <48 x i32> + store <48 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor6_vf16(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor6_vf16: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r13 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512F-ONLY-NEXT: kshiftrw $10, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %edi +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %edx +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $14, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $15, %k0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k1 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movl %edi, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: leal (%r10,%r10,2), %ebx +; AVX512F-ONLY-NEXT: leal (%rbx,%r12,4), %ebx +; AVX512F-ONLY-NEXT: leal (%rbx,%r12,8), %ebx +; AVX512F-ONLY-NEXT: movl %r12d, %r13d +; AVX512F-ONLY-NEXT: shll $4, %r13d +; AVX512F-ONLY-NEXT: orl %ebx, %r13d +; AVX512F-ONLY-NEXT: movl %r12d, %ebx +; AVX512F-ONLY-NEXT: shll $5, %ebx +; AVX512F-ONLY-NEXT: orl %r13d, %ebx +; AVX512F-ONLY-NEXT: movl %r12d, %r13d +; AVX512F-ONLY-NEXT: shll $6, %r13d +; AVX512F-ONLY-NEXT: shll $7, %r12d +; AVX512F-ONLY-NEXT: orl %r13d, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movl %r14d, %r13d +; AVX512F-ONLY-NEXT: shll $8, %r13d +; AVX512F-ONLY-NEXT: orl %r12d, %r13d +; AVX512F-ONLY-NEXT: movl %r14d, %r12d +; AVX512F-ONLY-NEXT: shll $9, %r12d +; AVX512F-ONLY-NEXT: orl %r13d, %r12d +; AVX512F-ONLY-NEXT: movl %r14d, %r13d +; AVX512F-ONLY-NEXT: shll $10, %r13d +; AVX512F-ONLY-NEXT: orl %r12d, %r13d +; AVX512F-ONLY-NEXT: movl %r14d, %r12d +; AVX512F-ONLY-NEXT: shll $11, %r12d +; AVX512F-ONLY-NEXT: orl %r13d, %r12d +; AVX512F-ONLY-NEXT: movl %r14d, %r13d +; AVX512F-ONLY-NEXT: shll $12, %r13d +; AVX512F-ONLY-NEXT: orl %r12d, %r13d +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k1 +; AVX512F-ONLY-NEXT: shll $13, %r14d +; AVX512F-ONLY-NEXT: orl %r13d, %r14d +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movl %ebp, %r13d +; AVX512F-ONLY-NEXT: shll $14, %r13d +; AVX512F-ONLY-NEXT: orl %r14d, %r13d +; AVX512F-ONLY-NEXT: movl %ebp, %r14d +; AVX512F-ONLY-NEXT: shll $15, %r14d +; AVX512F-ONLY-NEXT: orl %r13d, %r14d +; AVX512F-ONLY-NEXT: movl %ebp, %r13d +; AVX512F-ONLY-NEXT: shll $16, %r13d +; AVX512F-ONLY-NEXT: orl %r14d, %r13d +; AVX512F-ONLY-NEXT: movl %ebp, %r14d +; AVX512F-ONLY-NEXT: shll $17, %r14d +; AVX512F-ONLY-NEXT: orl %r13d, %r14d +; AVX512F-ONLY-NEXT: movl %ebp, %r13d +; AVX512F-ONLY-NEXT: shll $18, %r13d +; AVX512F-ONLY-NEXT: orl %r14d, %r13d +; AVX512F-ONLY-NEXT: kmovw %k1, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-ONLY-NEXT: shll $19, %ebp +; AVX512F-ONLY-NEXT: orl %r13d, %ebp +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movl %r15d, %r13d +; AVX512F-ONLY-NEXT: shll $20, %r13d +; AVX512F-ONLY-NEXT: orl %ebp, %r13d +; AVX512F-ONLY-NEXT: movl %r15d, %ebp +; AVX512F-ONLY-NEXT: shll $21, %ebp +; AVX512F-ONLY-NEXT: orl %r13d, %ebp +; AVX512F-ONLY-NEXT: movl %r15d, %r13d +; AVX512F-ONLY-NEXT: shll $22, %r13d +; AVX512F-ONLY-NEXT: orl %ebp, %r13d +; AVX512F-ONLY-NEXT: movl %r15d, %ebp +; AVX512F-ONLY-NEXT: shll $23, %ebp +; AVX512F-ONLY-NEXT: orl %r13d, %ebp +; AVX512F-ONLY-NEXT: movl %r15d, %r13d +; AVX512F-ONLY-NEXT: shll $24, %r13d +; AVX512F-ONLY-NEXT: orl %ebp, %r13d +; AVX512F-ONLY-NEXT: shll $25, %r15d +; AVX512F-ONLY-NEXT: orl %r13d, %r15d +; AVX512F-ONLY-NEXT: movl %r11d, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movl %r13d, %ebp +; AVX512F-ONLY-NEXT: shll $26, %ebp +; AVX512F-ONLY-NEXT: orl %r15d, %ebp +; AVX512F-ONLY-NEXT: movl %r13d, %r15d +; AVX512F-ONLY-NEXT: shll $27, %r15d +; AVX512F-ONLY-NEXT: orl %ebp, %r15d +; AVX512F-ONLY-NEXT: movl %r13d, %ebp +; AVX512F-ONLY-NEXT: shll $28, %ebp +; AVX512F-ONLY-NEXT: orl %r15d, %ebp +; AVX512F-ONLY-NEXT: movl %r13d, %eax +; AVX512F-ONLY-NEXT: shll $29, %eax +; AVX512F-ONLY-NEXT: orl %ebp, %eax +; AVX512F-ONLY-NEXT: kmovw %k1, %r15d +; AVX512F-ONLY-NEXT: shll $30, %r13d +; AVX512F-ONLY-NEXT: orl %eax, %r13d +; AVX512F-ONLY-NEXT: kmovw %k0, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $9, %k0, %k0 +; AVX512F-ONLY-NEXT: shll $31, %r11d +; AVX512F-ONLY-NEXT: orl %r13d, %r11d +; AVX512F-ONLY-NEXT: kmovw %k0, %r13d +; AVX512F-ONLY-NEXT: movzbl %dil, %edi +; AVX512F-ONLY-NEXT: orl %ebx, %r11d +; AVX512F-ONLY-NEXT: movl %r11d, 8(%rsi) +; AVX512F-ONLY-NEXT: movzbl %bpl, %r11d +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: leaq (%r11,%r11,2), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%r11,4), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%r11,8), %rax +; AVX512F-ONLY-NEXT: movq %r11, %rbx +; AVX512F-ONLY-NEXT: shlq $4, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $5, %r11 +; AVX512F-ONLY-NEXT: orq %rbx, %r11 +; AVX512F-ONLY-NEXT: movzbl %r9b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $6, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $7, %rbx +; AVX512F-ONLY-NEXT: orq %r9, %rbx +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $8, %r9 +; AVX512F-ONLY-NEXT: orq %rbx, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $9, %rbx +; AVX512F-ONLY-NEXT: orq %r9, %rbx +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $10, %r9 +; AVX512F-ONLY-NEXT: orq %rbx, %r9 +; AVX512F-ONLY-NEXT: shlq $11, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movzbl %r8b, %r8d +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $12, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $13, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $14, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $15, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $16, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $17, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: movzbl %dl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $18, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $19, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $20, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $21, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $22, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $24, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $25, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $26, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $27, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $28, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $29, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: movzbl %cl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $30, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $32, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $34, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $35, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $37, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $39, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rcx +; AVX512F-ONLY-NEXT: shlq $43, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rcx +; AVX512F-ONLY-NEXT: shlq $45, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $47, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $49, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $51, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $53, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $55, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $57, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $58, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $59, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $60, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r10, %rcx +; AVX512F-ONLY-NEXT: shlq $61, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $62, %r10 +; AVX512F-ONLY-NEXT: orq %rcx, %r10 +; AVX512F-ONLY-NEXT: shlq $63, %rdi +; AVX512F-ONLY-NEXT: orq %r10, %rdi +; AVX512F-ONLY-NEXT: orq %r11, %rdi +; AVX512F-ONLY-NEXT: movq %rdi, (%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r13 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor6_vf16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %edi +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r8d +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r9d +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r14d +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r15d +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r11d +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movl %edi, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: leal (%r10,%r10,2), %ebx +; AVX512DQ-NEXT: leal (%rbx,%r12,4), %ebx +; AVX512DQ-NEXT: leal (%rbx,%r12,8), %ebx +; AVX512DQ-NEXT: movl %r12d, %r13d +; AVX512DQ-NEXT: shll $4, %r13d +; AVX512DQ-NEXT: orl %ebx, %r13d +; AVX512DQ-NEXT: movl %r12d, %ebx +; AVX512DQ-NEXT: shll $5, %ebx +; AVX512DQ-NEXT: orl %r13d, %ebx +; AVX512DQ-NEXT: movl %r12d, %r13d +; AVX512DQ-NEXT: shll $6, %r13d +; AVX512DQ-NEXT: shll $7, %r12d +; AVX512DQ-NEXT: orl %r13d, %r12d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movl %r14d, %r13d +; AVX512DQ-NEXT: shll $8, %r13d +; AVX512DQ-NEXT: orl %r12d, %r13d +; AVX512DQ-NEXT: movl %r14d, %r12d +; AVX512DQ-NEXT: shll $9, %r12d +; AVX512DQ-NEXT: orl %r13d, %r12d +; AVX512DQ-NEXT: movl %r14d, %r13d +; AVX512DQ-NEXT: shll $10, %r13d +; AVX512DQ-NEXT: orl %r12d, %r13d +; AVX512DQ-NEXT: movl %r14d, %r12d +; AVX512DQ-NEXT: shll $11, %r12d +; AVX512DQ-NEXT: orl %r13d, %r12d +; AVX512DQ-NEXT: movl %r14d, %r13d +; AVX512DQ-NEXT: shll $12, %r13d +; AVX512DQ-NEXT: orl %r12d, %r13d +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k1 +; AVX512DQ-NEXT: shll $13, %r14d +; AVX512DQ-NEXT: orl %r13d, %r14d +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movl %ebp, %r13d +; AVX512DQ-NEXT: shll $14, %r13d +; AVX512DQ-NEXT: orl %r14d, %r13d +; AVX512DQ-NEXT: movl %ebp, %r14d +; AVX512DQ-NEXT: shll $15, %r14d +; AVX512DQ-NEXT: orl %r13d, %r14d +; AVX512DQ-NEXT: movl %ebp, %r13d +; AVX512DQ-NEXT: shll $16, %r13d +; AVX512DQ-NEXT: orl %r14d, %r13d +; AVX512DQ-NEXT: movl %ebp, %r14d +; AVX512DQ-NEXT: shll $17, %r14d +; AVX512DQ-NEXT: orl %r13d, %r14d +; AVX512DQ-NEXT: movl %ebp, %r13d +; AVX512DQ-NEXT: shll $18, %r13d +; AVX512DQ-NEXT: orl %r14d, %r13d +; AVX512DQ-NEXT: kmovw %k1, %r14d +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512DQ-NEXT: shll $19, %ebp +; AVX512DQ-NEXT: orl %r13d, %ebp +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movl %r15d, %r13d +; AVX512DQ-NEXT: shll $20, %r13d +; AVX512DQ-NEXT: orl %ebp, %r13d +; AVX512DQ-NEXT: movl %r15d, %ebp +; AVX512DQ-NEXT: shll $21, %ebp +; AVX512DQ-NEXT: orl %r13d, %ebp +; AVX512DQ-NEXT: movl %r15d, %r13d +; AVX512DQ-NEXT: shll $22, %r13d +; AVX512DQ-NEXT: orl %ebp, %r13d +; AVX512DQ-NEXT: movl %r15d, %ebp +; AVX512DQ-NEXT: shll $23, %ebp +; AVX512DQ-NEXT: orl %r13d, %ebp +; AVX512DQ-NEXT: movl %r15d, %r13d +; AVX512DQ-NEXT: shll $24, %r13d +; AVX512DQ-NEXT: orl %ebp, %r13d +; AVX512DQ-NEXT: shll $25, %r15d +; AVX512DQ-NEXT: orl %r13d, %r15d +; AVX512DQ-NEXT: movl %r11d, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movl %r13d, %ebp +; AVX512DQ-NEXT: shll $26, %ebp +; AVX512DQ-NEXT: orl %r15d, %ebp +; AVX512DQ-NEXT: movl %r13d, %r15d +; AVX512DQ-NEXT: shll $27, %r15d +; AVX512DQ-NEXT: orl %ebp, %r15d +; AVX512DQ-NEXT: movl %r13d, %ebp +; AVX512DQ-NEXT: shll $28, %ebp +; AVX512DQ-NEXT: orl %r15d, %ebp +; AVX512DQ-NEXT: movl %r13d, %eax +; AVX512DQ-NEXT: shll $29, %eax +; AVX512DQ-NEXT: orl %ebp, %eax +; AVX512DQ-NEXT: kmovw %k1, %r15d +; AVX512DQ-NEXT: shll $30, %r13d +; AVX512DQ-NEXT: orl %eax, %r13d +; AVX512DQ-NEXT: kmovw %k0, %ebp +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512DQ-NEXT: shll $31, %r11d +; AVX512DQ-NEXT: orl %r13d, %r11d +; AVX512DQ-NEXT: kmovw %k0, %r13d +; AVX512DQ-NEXT: movzbl %dil, %edi +; AVX512DQ-NEXT: orl %ebx, %r11d +; AVX512DQ-NEXT: movl %r11d, 8(%rsi) +; AVX512DQ-NEXT: movzbl %bpl, %r11d +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: leaq (%r11,%r11,2), %rax +; AVX512DQ-NEXT: leaq (%rax,%r11,4), %rax +; AVX512DQ-NEXT: leaq (%rax,%r11,8), %rax +; AVX512DQ-NEXT: movq %r11, %rbx +; AVX512DQ-NEXT: shlq $4, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: shlq $5, %r11 +; AVX512DQ-NEXT: orq %rbx, %r11 +; AVX512DQ-NEXT: movzbl %r9b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $6, %r9 +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $7, %rbx +; AVX512DQ-NEXT: orq %r9, %rbx +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $8, %r9 +; AVX512DQ-NEXT: orq %rbx, %r9 +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $9, %rbx +; AVX512DQ-NEXT: orq %r9, %rbx +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $10, %r9 +; AVX512DQ-NEXT: orq %rbx, %r9 +; AVX512DQ-NEXT: shlq $11, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movzbl %r8b, %r8d +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $12, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $13, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $14, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $15, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $16, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: shlq $17, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: movzbl %dl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $18, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $19, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $20, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $21, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $22, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %r8 +; AVX512DQ-NEXT: shlq $24, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %rdx, %rax +; AVX512DQ-NEXT: shlq $25, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rdx, %r8 +; AVX512DQ-NEXT: shlq $26, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %rdx, %rax +; AVX512DQ-NEXT: shlq $27, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rdx, %r8 +; AVX512DQ-NEXT: shlq $28, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: shlq $29, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: movzbl %cl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $30, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $32, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $34, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: shlq $35, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $37, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $39, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %rcx +; AVX512DQ-NEXT: shlq $43, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r14, %rcx +; AVX512DQ-NEXT: shlq $45, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $47, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $49, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $51, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $53, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $55, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $57, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $58, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $59, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $60, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r10, %rcx +; AVX512DQ-NEXT: shlq $61, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $62, %r10 +; AVX512DQ-NEXT: orq %rcx, %r10 +; AVX512DQ-NEXT: shlq $63, %rdi +; AVX512DQ-NEXT: orq %r10, %rdi +; AVX512DQ-NEXT: orq %r11, %rdi +; AVX512DQ-NEXT: movq %rdi, (%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor6_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: kshiftrw $10, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrw $5, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrw $4, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrw $3, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrw $2, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrw $1, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrw $11, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrw $13, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrw $14, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrw $15, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrw $6, %k0, %k1 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movl %edi, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: leal (%r10,%r10,2), %ebx +; AVX512BW-NEXT: leal (%rbx,%r12,4), %ebx +; AVX512BW-NEXT: leal (%rbx,%r12,8), %ebx +; AVX512BW-NEXT: movl %r12d, %r13d +; AVX512BW-NEXT: shll $4, %r13d +; AVX512BW-NEXT: orl %ebx, %r13d +; AVX512BW-NEXT: movl %r12d, %ebx +; AVX512BW-NEXT: shll $5, %ebx +; AVX512BW-NEXT: orl %r13d, %ebx +; AVX512BW-NEXT: movl %r12d, %r13d +; AVX512BW-NEXT: shll $6, %r13d +; AVX512BW-NEXT: shll $7, %r12d +; AVX512BW-NEXT: orl %r13d, %r12d +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movl %r14d, %r13d +; AVX512BW-NEXT: shll $8, %r13d +; AVX512BW-NEXT: orl %r12d, %r13d +; AVX512BW-NEXT: movl %r14d, %r12d +; AVX512BW-NEXT: shll $9, %r12d +; AVX512BW-NEXT: orl %r13d, %r12d +; AVX512BW-NEXT: movl %r14d, %r13d +; AVX512BW-NEXT: shll $10, %r13d +; AVX512BW-NEXT: orl %r12d, %r13d +; AVX512BW-NEXT: movl %r14d, %r12d +; AVX512BW-NEXT: shll $11, %r12d +; AVX512BW-NEXT: orl %r13d, %r12d +; AVX512BW-NEXT: movl %r14d, %r13d +; AVX512BW-NEXT: shll $12, %r13d +; AVX512BW-NEXT: orl %r12d, %r13d +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrw $7, %k0, %k1 +; AVX512BW-NEXT: shll $13, %r14d +; AVX512BW-NEXT: orl %r13d, %r14d +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movl %ebp, %r13d +; AVX512BW-NEXT: shll $14, %r13d +; AVX512BW-NEXT: orl %r14d, %r13d +; AVX512BW-NEXT: movl %ebp, %r14d +; AVX512BW-NEXT: shll $15, %r14d +; AVX512BW-NEXT: orl %r13d, %r14d +; AVX512BW-NEXT: movl %ebp, %r13d +; AVX512BW-NEXT: shll $16, %r13d +; AVX512BW-NEXT: orl %r14d, %r13d +; AVX512BW-NEXT: movl %ebp, %r14d +; AVX512BW-NEXT: shll $17, %r14d +; AVX512BW-NEXT: orl %r13d, %r14d +; AVX512BW-NEXT: movl %ebp, %r13d +; AVX512BW-NEXT: shll $18, %r13d +; AVX512BW-NEXT: orl %r14d, %r13d +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: shll $19, %ebp +; AVX512BW-NEXT: orl %r13d, %ebp +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movl %r15d, %r13d +; AVX512BW-NEXT: shll $20, %r13d +; AVX512BW-NEXT: orl %ebp, %r13d +; AVX512BW-NEXT: movl %r15d, %ebp +; AVX512BW-NEXT: shll $21, %ebp +; AVX512BW-NEXT: orl %r13d, %ebp +; AVX512BW-NEXT: movl %r15d, %r13d +; AVX512BW-NEXT: shll $22, %r13d +; AVX512BW-NEXT: orl %ebp, %r13d +; AVX512BW-NEXT: movl %r15d, %ebp +; AVX512BW-NEXT: shll $23, %ebp +; AVX512BW-NEXT: orl %r13d, %ebp +; AVX512BW-NEXT: movl %r15d, %r13d +; AVX512BW-NEXT: shll $24, %r13d +; AVX512BW-NEXT: orl %ebp, %r13d +; AVX512BW-NEXT: shll $25, %r15d +; AVX512BW-NEXT: orl %r13d, %r15d +; AVX512BW-NEXT: movl %r11d, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movl %r13d, %ebp +; AVX512BW-NEXT: shll $26, %ebp +; AVX512BW-NEXT: orl %r15d, %ebp +; AVX512BW-NEXT: movl %r13d, %r15d +; AVX512BW-NEXT: shll $27, %r15d +; AVX512BW-NEXT: orl %ebp, %r15d +; AVX512BW-NEXT: movl %r13d, %ebp +; AVX512BW-NEXT: shll $28, %ebp +; AVX512BW-NEXT: orl %r15d, %ebp +; AVX512BW-NEXT: movl %r13d, %eax +; AVX512BW-NEXT: shll $29, %eax +; AVX512BW-NEXT: orl %ebp, %eax +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: shll $30, %r13d +; AVX512BW-NEXT: orl %eax, %r13d +; AVX512BW-NEXT: kmovd %k0, %ebp +; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 +; AVX512BW-NEXT: shll $31, %r11d +; AVX512BW-NEXT: orl %r13d, %r11d +; AVX512BW-NEXT: kmovd %k0, %r13d +; AVX512BW-NEXT: movzbl %dil, %edi +; AVX512BW-NEXT: orl %ebx, %r11d +; AVX512BW-NEXT: movl %r11d, 8(%rsi) +; AVX512BW-NEXT: movzbl %bpl, %r11d +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: leaq (%r11,%r11,2), %rax +; AVX512BW-NEXT: leaq (%rax,%r11,4), %rax +; AVX512BW-NEXT: leaq (%rax,%r11,8), %rax +; AVX512BW-NEXT: movq %r11, %rbx +; AVX512BW-NEXT: shlq $4, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: shlq $5, %r11 +; AVX512BW-NEXT: orq %rbx, %r11 +; AVX512BW-NEXT: movzbl %r9b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $6, %r9 +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: shlq $7, %rbx +; AVX512BW-NEXT: orq %r9, %rbx +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $8, %r9 +; AVX512BW-NEXT: orq %rbx, %r9 +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: shlq $9, %rbx +; AVX512BW-NEXT: orq %r9, %rbx +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $10, %r9 +; AVX512BW-NEXT: orq %rbx, %r9 +; AVX512BW-NEXT: shlq $11, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movzbl %r8b, %r8d +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $12, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $13, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $14, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $15, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $16, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: shlq $17, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: movzbl %dl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $18, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $19, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $20, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $21, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $22, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: shlq $23, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %r8 +; AVX512BW-NEXT: shlq $24, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $25, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdx, %r8 +; AVX512BW-NEXT: shlq $26, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $27, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdx, %r8 +; AVX512BW-NEXT: shlq $28, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: shlq $29, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: movzbl %cl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $30, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $32, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $34, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $35, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $37, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $39, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r14, %rcx +; AVX512BW-NEXT: shlq $43, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r14, %rcx +; AVX512BW-NEXT: shlq $45, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $47, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $49, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $51, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $53, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $55, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $57, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $59, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r10, %rcx +; AVX512BW-NEXT: shlq $61, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $62, %r10 +; AVX512BW-NEXT: orq %rcx, %r10 +; AVX512BW-NEXT: shlq $63, %rdi +; AVX512BW-NEXT: orq %r10, %rdi +; AVX512BW-NEXT: orq %r11, %rdi +; AVX512BW-NEXT: movq %rdi, (%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <16 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <96 x i32> + store <96 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor6_vf32(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor6_vf32: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r13 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k0 +; AVX512F-ONLY-NEXT: kshiftrw $15, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %edx +; AVX512F-ONLY-NEXT: kshiftrw $10, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $9, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-ONLY-NEXT: kshiftrw $8, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %edi +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $15, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $5, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $2, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k1, %r14d +; AVX512F-ONLY-NEXT: movzbl %r14b, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: leaq (%r14,%r14,2), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%r14,4), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%r14,8), %r12 +; AVX512F-ONLY-NEXT: movq %r14, %r13 +; AVX512F-ONLY-NEXT: shlq $4, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $5, %r14 +; AVX512F-ONLY-NEXT: orq %r13, %r14 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $6, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $7, %rbp +; AVX512F-ONLY-NEXT: orq %r12, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $8, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $9, %rbp +; AVX512F-ONLY-NEXT: orq %r12, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $10, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $3, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $11, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $12, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $13, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $14, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $15, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $16, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: kmovw %k2, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $17, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $18, %rbp +; AVX512F-ONLY-NEXT: orq %r12, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $19, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $20, %rbp +; AVX512F-ONLY-NEXT: orq %r12, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $21, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $22, %rbp +; AVX512F-ONLY-NEXT: orq %r12, %rbp +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $23, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movzbl %r12b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $24, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $25, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $26, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $27, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $28, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $29, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: movzbl %r15b, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $30, %rbp +; AVX512F-ONLY-NEXT: orq %r12, %rbp +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $31, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rbp +; AVX512F-ONLY-NEXT: shlq $32, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $33, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rbp +; AVX512F-ONLY-NEXT: shlq $34, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: kmovw %k2, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $35, %r15 +; AVX512F-ONLY-NEXT: orq %rbp, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $36, %rbp +; AVX512F-ONLY-NEXT: orq %r15, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $37, %r15 +; AVX512F-ONLY-NEXT: orq %rbp, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $38, %rbp +; AVX512F-ONLY-NEXT: orq %r15, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $39, %r15 +; AVX512F-ONLY-NEXT: orq %rbp, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $40, %rbp +; AVX512F-ONLY-NEXT: orq %r15, %rbp +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $41, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rbp +; AVX512F-ONLY-NEXT: shlq $42, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $43, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rbp +; AVX512F-ONLY-NEXT: shlq $44, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $45, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rbp +; AVX512F-ONLY-NEXT: shlq $46, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: kmovw %k2, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $47, %r15 +; AVX512F-ONLY-NEXT: orq %rbp, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $48, %rbp +; AVX512F-ONLY-NEXT: orq %r15, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $49, %r15 +; AVX512F-ONLY-NEXT: orq %rbp, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $50, %rbp +; AVX512F-ONLY-NEXT: orq %r15, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $51, %r15 +; AVX512F-ONLY-NEXT: orq %rbp, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $52, %rbp +; AVX512F-ONLY-NEXT: orq %r15, %rbp +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $53, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rbp +; AVX512F-ONLY-NEXT: shlq $54, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $55, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rbp +; AVX512F-ONLY-NEXT: shlq $56, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $57, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rbp +; AVX512F-ONLY-NEXT: shlq $58, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $59, %r15 +; AVX512F-ONLY-NEXT: orq %rbp, %r15 +; AVX512F-ONLY-NEXT: movl %r12d, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $60, %rbp +; AVX512F-ONLY-NEXT: orq %r15, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $61, %r15 +; AVX512F-ONLY-NEXT: orq %rbp, %r15 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $12, %k1, %k2 +; AVX512F-ONLY-NEXT: movzbl %r12b, %eax +; AVX512F-ONLY-NEXT: shlq $62, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $63, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: kmovw %k2, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k1, %k2 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-ONLY-NEXT: movq %r15, (%rsi) +; AVX512F-ONLY-NEXT: movzbl %r14b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%r12,4), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%r12,8), %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $4, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $5, %r14 +; AVX512F-ONLY-NEXT: orq %r15, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: shlq $7, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $9, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $11, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $13, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movzbl %r12b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $15, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $17, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $18, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $19, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $20, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $21, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $22, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $23, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $24, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $25, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movzbl %bl, %ebx +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %r13 +; AVX512F-ONLY-NEXT: shlq $26, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $27, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r13 +; AVX512F-ONLY-NEXT: shlq $28, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $29, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r13 +; AVX512F-ONLY-NEXT: shlq $30, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $31, %rbx +; AVX512F-ONLY-NEXT: orq %r13, %rbx +; AVX512F-ONLY-NEXT: kmovw %k0, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rbx +; AVX512F-ONLY-NEXT: shlq $33, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rbx +; AVX512F-ONLY-NEXT: shlq $35, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $36, %rbp +; AVX512F-ONLY-NEXT: orq %rbx, %rbp +; AVX512F-ONLY-NEXT: kmovw %k1, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $11, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $37, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $39, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rbp +; AVX512F-ONLY-NEXT: shlq $41, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $42, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $12, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $43, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $45, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $47, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $49, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %r11b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r11 +; AVX512F-ONLY-NEXT: shlq $50, %r11 +; AVX512F-ONLY-NEXT: orq %r12, %r11 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %r11, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r11 +; AVX512F-ONLY-NEXT: shlq $52, %r11 +; AVX512F-ONLY-NEXT: orq %r12, %r11 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $53, %r12 +; AVX512F-ONLY-NEXT: orq %r11, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $54, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k0, %k0 +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %r12 +; AVX512F-ONLY-NEXT: shlq $56, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r12 +; AVX512F-ONLY-NEXT: shlq $58, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $59, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r13 +; AVX512F-ONLY-NEXT: shlq $60, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k0, %r12d +; AVX512F-ONLY-NEXT: shlq $61, %rbx +; AVX512F-ONLY-NEXT: orq %r13, %rbx +; AVX512F-ONLY-NEXT: movzbl %r10b, %eax +; AVX512F-ONLY-NEXT: # kill: def $r10d killed $r10d def $r10 +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: shlq $62, %r10 +; AVX512F-ONLY-NEXT: orq %rbx, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $63, %rbx +; AVX512F-ONLY-NEXT: orq %r10, %rbx +; AVX512F-ONLY-NEXT: orq %r14, %rbx +; AVX512F-ONLY-NEXT: movq %rbx, 8(%rsi) +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %r10 +; AVX512F-ONLY-NEXT: leaq (%r10,%rax,4), %r10 +; AVX512F-ONLY-NEXT: leaq (%r10,%rax,8), %rax +; AVX512F-ONLY-NEXT: movzbl %r9b, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rbx +; AVX512F-ONLY-NEXT: shlq $4, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r10, %r9 +; AVX512F-ONLY-NEXT: shlq $5, %r9 +; AVX512F-ONLY-NEXT: orq %rbx, %r9 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movq %r10, %rbx +; AVX512F-ONLY-NEXT: shlq $7, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $9, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movzbl %r8b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $10, %r8 +; AVX512F-ONLY-NEXT: orq %r10, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $11, %r10 +; AVX512F-ONLY-NEXT: orq %r8, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $12, %r8 +; AVX512F-ONLY-NEXT: orq %r10, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $13, %r10 +; AVX512F-ONLY-NEXT: orq %r8, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $14, %r8 +; AVX512F-ONLY-NEXT: orq %r10, %r8 +; AVX512F-ONLY-NEXT: shlq $15, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movzbl %dil, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $16, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $17, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $18, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $19, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $20, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $21, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $22, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $23, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $24, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $25, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $26, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $27, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movzbl %cl, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $28, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $29, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $30, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $31, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %rcx +; AVX512F-ONLY-NEXT: shlq $35, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %rcx +; AVX512F-ONLY-NEXT: shlq $37, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $39, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %rbp, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $41, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $43, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $45, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r11, %rcx +; AVX512F-ONLY-NEXT: shlq $47, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r11, %rcx +; AVX512F-ONLY-NEXT: shlq $49, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $51, %r11 +; AVX512F-ONLY-NEXT: orq %rax, %r11 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %r11, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $53, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $55, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $57, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movl %edx, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $58, %rcx +; AVX512F-ONLY-NEXT: orq %r12, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $59, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $60, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $61, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $62, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movzbl %dl, %ecx +; AVX512F-ONLY-NEXT: shlq $63, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: orq %r9, %rcx +; AVX512F-ONLY-NEXT: movq %rcx, 16(%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r13 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor6_vf32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k1 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %edx +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %ecx +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %edi +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r8d +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r9d +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r10d +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r11d +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %ebx +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k1, %r14d +; AVX512DQ-NEXT: movzbl %r14b, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: leaq (%r14,%r14,2), %r12 +; AVX512DQ-NEXT: leaq (%r12,%r14,4), %r12 +; AVX512DQ-NEXT: leaq (%r12,%r14,8), %r12 +; AVX512DQ-NEXT: movq %r14, %r13 +; AVX512DQ-NEXT: shlq $4, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: shlq $5, %r14 +; AVX512DQ-NEXT: orq %r13, %r14 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $6, %r12 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $7, %rbp +; AVX512DQ-NEXT: orq %r12, %rbp +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $8, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $9, %rbp +; AVX512DQ-NEXT: orq %r12, %rbp +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $10, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k2 +; AVX512DQ-NEXT: shlq $11, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $12, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $13, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $14, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $15, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $16, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: kmovw %k2, %r13d +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k2 +; AVX512DQ-NEXT: shlq $17, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: movzbl %r13b, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $18, %rbp +; AVX512DQ-NEXT: orq %r12, %rbp +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $19, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $20, %rbp +; AVX512DQ-NEXT: orq %r12, %rbp +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $21, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $22, %rbp +; AVX512DQ-NEXT: orq %r12, %rbp +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k2 +; AVX512DQ-NEXT: shlq $23, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movzbl %r12b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $24, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $25, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $26, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $27, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $28, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: shlq $29, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: movzbl %r15b, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $30, %rbp +; AVX512DQ-NEXT: orq %r12, %rbp +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k2 +; AVX512DQ-NEXT: shlq $31, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rbp +; AVX512DQ-NEXT: shlq $32, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $33, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r15, %rbp +; AVX512DQ-NEXT: shlq $34, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: kmovw %k2, %r13d +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k2 +; AVX512DQ-NEXT: shlq $35, %r15 +; AVX512DQ-NEXT: orq %rbp, %r15 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $36, %rbp +; AVX512DQ-NEXT: orq %r15, %rbp +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $37, %r15 +; AVX512DQ-NEXT: orq %rbp, %r15 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $38, %rbp +; AVX512DQ-NEXT: orq %r15, %rbp +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $39, %r15 +; AVX512DQ-NEXT: orq %rbp, %r15 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $40, %rbp +; AVX512DQ-NEXT: orq %r15, %rbp +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 +; AVX512DQ-NEXT: shlq $41, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rbp +; AVX512DQ-NEXT: shlq $42, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $43, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r15, %rbp +; AVX512DQ-NEXT: shlq $44, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $45, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r15, %rbp +; AVX512DQ-NEXT: shlq $46, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: kmovw %k2, %r13d +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQ-NEXT: shlq $47, %r15 +; AVX512DQ-NEXT: orq %rbp, %r15 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $48, %rbp +; AVX512DQ-NEXT: orq %r15, %rbp +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $49, %r15 +; AVX512DQ-NEXT: orq %rbp, %r15 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $50, %rbp +; AVX512DQ-NEXT: orq %r15, %rbp +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $51, %r15 +; AVX512DQ-NEXT: orq %rbp, %r15 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $52, %rbp +; AVX512DQ-NEXT: orq %r15, %rbp +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k2 +; AVX512DQ-NEXT: shlq $53, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rbp +; AVX512DQ-NEXT: shlq $54, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $55, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r15, %rbp +; AVX512DQ-NEXT: shlq $56, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $57, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r15, %rbp +; AVX512DQ-NEXT: shlq $58, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: shlq $59, %r15 +; AVX512DQ-NEXT: orq %rbp, %r15 +; AVX512DQ-NEXT: movl %r12d, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $60, %rbp +; AVX512DQ-NEXT: orq %r15, %rbp +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $61, %r15 +; AVX512DQ-NEXT: orq %rbp, %r15 +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k2 +; AVX512DQ-NEXT: movzbl %r12b, %eax +; AVX512DQ-NEXT: shlq $62, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $63, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: kmovw %k2, %r13d +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k2 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k1 +; AVX512DQ-NEXT: movq %r15, (%rsi) +; AVX512DQ-NEXT: movzbl %r14b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %rax +; AVX512DQ-NEXT: leaq (%rax,%r12,4), %rax +; AVX512DQ-NEXT: leaq (%rax,%r12,8), %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $4, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $5, %r14 +; AVX512DQ-NEXT: orq %r15, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: shlq $7, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %r13b, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $9, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $11, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k1 +; AVX512DQ-NEXT: shlq $13, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movzbl %r12b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $15, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $17, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $18, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: kmovw %k1, %r15d +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512DQ-NEXT: shlq $19, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $20, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $21, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $22, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $23, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $24, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512DQ-NEXT: shlq $25, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movzbl %bl, %ebx +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %r13 +; AVX512DQ-NEXT: shlq $26, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $27, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %rbx, %r13 +; AVX512DQ-NEXT: shlq $28, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $29, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %rbx, %r13 +; AVX512DQ-NEXT: shlq $30, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: shlq $31, %rbx +; AVX512DQ-NEXT: orq %r13, %rbx +; AVX512DQ-NEXT: kmovw %k0, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r13, %rbx +; AVX512DQ-NEXT: shlq $33, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r13, %rbx +; AVX512DQ-NEXT: shlq $35, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $36, %rbp +; AVX512DQ-NEXT: orq %rbx, %rbp +; AVX512DQ-NEXT: kmovw %k1, %ebx +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k1 +; AVX512DQ-NEXT: shlq $37, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $39, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r15, %rbp +; AVX512DQ-NEXT: shlq $41, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $42, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512DQ-NEXT: shlq $43, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $45, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $47, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: kmovw %k1, %r15d +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k1 +; AVX512DQ-NEXT: shlq $49, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %r11b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r11 +; AVX512DQ-NEXT: shlq $50, %r11 +; AVX512DQ-NEXT: orq %r12, %r11 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %r11, %r12 +; AVX512DQ-NEXT: movq %rax, %r11 +; AVX512DQ-NEXT: shlq $52, %r11 +; AVX512DQ-NEXT: orq %r12, %r11 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $53, %r12 +; AVX512DQ-NEXT: orq %r11, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $54, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k1, %r11d +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %r12 +; AVX512DQ-NEXT: shlq $56, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %rbx, %r12 +; AVX512DQ-NEXT: shlq $58, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $59, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %rbx, %r13 +; AVX512DQ-NEXT: shlq $60, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k0, %r12d +; AVX512DQ-NEXT: shlq $61, %rbx +; AVX512DQ-NEXT: orq %r13, %rbx +; AVX512DQ-NEXT: movzbl %r10b, %eax +; AVX512DQ-NEXT: # kill: def $r10d killed $r10d def $r10 +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: shlq $62, %r10 +; AVX512DQ-NEXT: orq %rbx, %r10 +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $63, %rbx +; AVX512DQ-NEXT: orq %r10, %rbx +; AVX512DQ-NEXT: orq %r14, %rbx +; AVX512DQ-NEXT: movq %rbx, 8(%rsi) +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %r10 +; AVX512DQ-NEXT: leaq (%r10,%rax,4), %r10 +; AVX512DQ-NEXT: leaq (%r10,%rax,8), %rax +; AVX512DQ-NEXT: movzbl %r9b, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rbx +; AVX512DQ-NEXT: shlq $4, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r10, %r9 +; AVX512DQ-NEXT: shlq $5, %r9 +; AVX512DQ-NEXT: orq %rbx, %r9 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movq %r10, %rbx +; AVX512DQ-NEXT: shlq $7, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: shlq $9, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movzbl %r8b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $10, %r8 +; AVX512DQ-NEXT: orq %r10, %r8 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $11, %r10 +; AVX512DQ-NEXT: orq %r8, %r10 +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $12, %r8 +; AVX512DQ-NEXT: orq %r10, %r8 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $13, %r10 +; AVX512DQ-NEXT: orq %r8, %r10 +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $14, %r8 +; AVX512DQ-NEXT: orq %r10, %r8 +; AVX512DQ-NEXT: shlq $15, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movzbl %dil, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $16, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $17, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $18, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $19, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $20, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: shlq $21, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $22, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $23, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $24, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $25, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $26, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: shlq $27, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movzbl %cl, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $28, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $29, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $30, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: shlq $31, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: shlq $33, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %rbp, %rcx +; AVX512DQ-NEXT: shlq $35, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %rbp, %rcx +; AVX512DQ-NEXT: shlq $37, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $39, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %rbp, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $41, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $43, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $45, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r11, %rcx +; AVX512DQ-NEXT: shlq $47, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r11, %rcx +; AVX512DQ-NEXT: shlq $49, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $51, %r11 +; AVX512DQ-NEXT: orq %rax, %r11 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %r11, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $53, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $55, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $57, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movl %edx, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $58, %rcx +; AVX512DQ-NEXT: orq %r12, %rcx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $59, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $60, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $61, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: shlq $62, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movzbl %dl, %ecx +; AVX512DQ-NEXT: shlq $63, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: orq %r9, %rcx +; AVX512DQ-NEXT: movq %rcx, 16(%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor6_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovd (%rdi), %k0 +; AVX512BW-NEXT: kshiftrd $31, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrd $25, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrd $23, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrd $22, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrd $15, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrd $14, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrd $5, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $2, %k0, %k1 +; AVX512BW-NEXT: kmovd %k0, %ebx +; AVX512BW-NEXT: movzbl %bl, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: leaq (%rbx,%rbx,2), %r12 +; AVX512BW-NEXT: leaq (%r12,%rbx,4), %r12 +; AVX512BW-NEXT: leaq (%r12,%rbx,8), %r12 +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $4, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: shlq $5, %rbx +; AVX512BW-NEXT: orq %r13, %rbx +; AVX512BW-NEXT: movzbl %r15b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $6, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $7, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $8, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $9, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $10, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrd $3, %k0, %k1 +; AVX512BW-NEXT: shlq $11, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %r13b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $12, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $13, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $14, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $15, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $16, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $4, %k0, %k1 +; AVX512BW-NEXT: shlq $17, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movzbl %r12b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $18, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $19, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $20, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $21, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $22, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $10, %k0, %k1 +; AVX512BW-NEXT: shlq $23, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %r15b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $24, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $25, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $26, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $27, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $28, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: shlq $29, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movzbl %r14b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $30, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $6, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $32, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $33, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $34, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $7, %k0, %k1 +; AVX512BW-NEXT: shlq $35, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $36, %r13 +; AVX512BW-NEXT: orq %r14, %r13 +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $37, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $38, %r13 +; AVX512BW-NEXT: orq %r14, %r13 +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $39, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $40, %r13 +; AVX512BW-NEXT: orq %r14, %r13 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $8, %k0, %k1 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $42, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $43, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $44, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $45, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $46, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $9, %k0, %k1 +; AVX512BW-NEXT: shlq $47, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $48, %r13 +; AVX512BW-NEXT: orq %r14, %r13 +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $49, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $50, %r13 +; AVX512BW-NEXT: orq %r14, %r13 +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $51, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $52, %r13 +; AVX512BW-NEXT: orq %r14, %r13 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $13, %k0, %k1 +; AVX512BW-NEXT: shlq $53, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $54, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $55, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $56, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $57, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $58, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: shlq $59, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: movl %r15d, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $60, %r13 +; AVX512BW-NEXT: orq %r14, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $61, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $12, %k0, %k1 +; AVX512BW-NEXT: movzbl %r15b, %r15d +; AVX512BW-NEXT: shlq $62, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $63, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 +; AVX512BW-NEXT: movq %rax, (%rsi) +; AVX512BW-NEXT: movzbl %bl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: leaq (%r15,%r15,2), %rax +; AVX512BW-NEXT: leaq (%rax,%r12,4), %rax +; AVX512BW-NEXT: leaq (%rax,%r12,8), %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $4, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: shlq $5, %rbx +; AVX512BW-NEXT: orq %r15, %rbx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %r13b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $12, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrd $17, %k0, %k1 +; AVX512BW-NEXT: shlq $13, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %r14b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: shlq $14, %r14 +; AVX512BW-NEXT: orq %r15, %r14 +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: shlq $15, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: shlq $16, %r14 +; AVX512BW-NEXT: orq %r15, %r14 +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: shlq $17, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $18, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $18, %k0, %k1 +; AVX512BW-NEXT: shlq $19, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $20, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $21, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $22, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $23, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %rbp +; AVX512BW-NEXT: shlq $24, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $19, %k0, %k1 +; AVX512BW-NEXT: shlq $25, %r12 +; AVX512BW-NEXT: orq %rbp, %r12 +; AVX512BW-NEXT: movzbl %r11b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: shlq $26, %r11 +; AVX512BW-NEXT: orq %r12, %r11 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $27, %r12 +; AVX512BW-NEXT: orq %r11, %r12 +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: shlq $28, %r11 +; AVX512BW-NEXT: orq %r12, %r11 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $29, %r12 +; AVX512BW-NEXT: orq %r11, %r12 +; AVX512BW-NEXT: movq %rax, %rbp +; AVX512BW-NEXT: shlq $30, %rbp +; AVX512BW-NEXT: orq %r12, %rbp +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrd $20, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $32, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $34, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $36, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 +; AVX512BW-NEXT: shlq $37, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $39, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r14, %rbp +; AVX512BW-NEXT: shlq $41, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $42, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 +; AVX512BW-NEXT: shlq $43, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r15, %r14 +; AVX512BW-NEXT: shlq $45, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r15, %r14 +; AVX512BW-NEXT: shlq $47, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrd $29, %k0, %k1 +; AVX512BW-NEXT: shlq $49, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r11, %r15 +; AVX512BW-NEXT: shlq $51, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r11, %r15 +; AVX512BW-NEXT: shlq $53, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $30, %k0, %k0 +; AVX512BW-NEXT: shlq $55, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r12, %r11 +; AVX512BW-NEXT: shlq $57, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r12, %r11 +; AVX512BW-NEXT: shlq $59, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: kmovd %k0, %r11d +; AVX512BW-NEXT: shlq $61, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %r10b, %eax +; AVX512BW-NEXT: # kill: def $r10d killed $r10d def $r10 +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: shlq $62, %r10 +; AVX512BW-NEXT: orq %r12, %r10 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $63, %r12 +; AVX512BW-NEXT: orq %r10, %r12 +; AVX512BW-NEXT: orq %rbx, %r12 +; AVX512BW-NEXT: movq %r12, 8(%rsi) +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r10 +; AVX512BW-NEXT: leaq (%r10,%rax,4), %r10 +; AVX512BW-NEXT: leaq (%r10,%rax,8), %rax +; AVX512BW-NEXT: movzbl %r9b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %rbx +; AVX512BW-NEXT: shlq $4, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r10, %r9 +; AVX512BW-NEXT: shlq $5, %r9 +; AVX512BW-NEXT: orq %rbx, %r9 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r10, %rbx +; AVX512BW-NEXT: shlq $7, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: shlq $9, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movzbl %r8b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $10, %r8 +; AVX512BW-NEXT: orq %r10, %r8 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: shlq $11, %r10 +; AVX512BW-NEXT: orq %r8, %r10 +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $12, %r8 +; AVX512BW-NEXT: orq %r10, %r8 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: shlq $13, %r10 +; AVX512BW-NEXT: orq %r8, %r10 +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $14, %r8 +; AVX512BW-NEXT: orq %r10, %r8 +; AVX512BW-NEXT: shlq $15, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movzbl %dil, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $16, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $17, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $18, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $19, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $20, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: shlq $21, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $22, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $23, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $24, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $25, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $26, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: shlq $27, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movzbl %cl, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $28, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $29, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $30, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: shlq $31, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: shlq $33, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %rbp, %rcx +; AVX512BW-NEXT: shlq $35, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %rbp, %rcx +; AVX512BW-NEXT: shlq $37, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $39, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %r14, %rcx +; AVX512BW-NEXT: shlq $41, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r14, %rcx +; AVX512BW-NEXT: shlq $43, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $45, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $47, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $49, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $51, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r11, %rcx +; AVX512BW-NEXT: shlq $53, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r11, %rcx +; AVX512BW-NEXT: shlq $55, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $57, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movl %edx, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $58, %rcx +; AVX512BW-NEXT: orq %r11, %rcx +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $59, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $60, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $61, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: shlq $62, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movzbl %dl, %ecx +; AVX512BW-NEXT: shlq $63, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: orq %r9, %rcx +; AVX512BW-NEXT: movq %rcx, 16(%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <32 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <192 x i32> + store <192 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor6_vf64(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r13 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k0 +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k2 +; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-ONLY-NEXT: kshiftrw $15, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $14, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %edx +; AVX512F-ONLY-NEXT: kshiftrw $13, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %edi +; AVX512F-ONLY-NEXT: kshiftrw $12, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $10, %k2, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $1, %k2, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k2, %k4 +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: movzbl %r14b, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: leaq (%r14,%r14,2), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%r14,4), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%r14,8), %r12 +; AVX512F-ONLY-NEXT: movq %r14, %r13 +; AVX512F-ONLY-NEXT: shlq $4, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $5, %r14 +; AVX512F-ONLY-NEXT: orq %r13, %r14 +; AVX512F-ONLY-NEXT: movzbl %r15b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $6, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $7, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $8, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $9, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $10, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $11, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $12, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $13, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $14, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $15, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $16, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $17, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movzbl %r12b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $18, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $19, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $20, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $21, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $22, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $23, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %r15b, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $24, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $25, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $26, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $27, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $28, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $29, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movzbl %r12b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $30, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $31, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $32, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $33, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $34, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $35, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %r15b, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $36, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $37, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $38, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $39, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $40, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $41, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $42, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $43, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $44, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $45, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $46, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $47, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $48, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $49, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $50, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $52, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $53, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movzbl %r12b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $54, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $55, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $56, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $57, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $58, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $59, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r15d +; AVX512F-ONLY-NEXT: movl %ebp, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $60, %rbp +; AVX512F-ONLY-NEXT: orq %r12, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $2, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $62, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $63, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: orq %r14, %r12 +; AVX512F-ONLY-NEXT: movq %r12, 24(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k3, %r14d +; AVX512F-ONLY-NEXT: movzbl %r14b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: leaq (%r12,%r12,2), %r14 +; AVX512F-ONLY-NEXT: leaq (%r14,%r12,4), %r14 +; AVX512F-ONLY-NEXT: leaq (%r14,%r12,8), %r14 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $4, %r13 +; AVX512F-ONLY-NEXT: orq %r14, %r13 +; AVX512F-ONLY-NEXT: shlq $5, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r14 +; AVX512F-ONLY-NEXT: shlq $6, %r14 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $7, %rbp +; AVX512F-ONLY-NEXT: orq %r14, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r14 +; AVX512F-ONLY-NEXT: shlq $8, %r14 +; AVX512F-ONLY-NEXT: orq %rbp, %r14 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $9, %rbp +; AVX512F-ONLY-NEXT: orq %r14, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r14 +; AVX512F-ONLY-NEXT: shlq $10, %r14 +; AVX512F-ONLY-NEXT: orq %rbp, %r14 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $3, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $11, %r13 +; AVX512F-ONLY-NEXT: orq %r14, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rbp +; AVX512F-ONLY-NEXT: shlq $12, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r14, %r13 +; AVX512F-ONLY-NEXT: shlq $13, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r14, %rbp +; AVX512F-ONLY-NEXT: shlq $14, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r14, %r13 +; AVX512F-ONLY-NEXT: shlq $15, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r14, %rbp +; AVX512F-ONLY-NEXT: shlq $16, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $17, %r14 +; AVX512F-ONLY-NEXT: orq %rbp, %r14 +; AVX512F-ONLY-NEXT: movzbl %r13b, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $18, %rbp +; AVX512F-ONLY-NEXT: orq %r14, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r14 +; AVX512F-ONLY-NEXT: shlq $19, %r14 +; AVX512F-ONLY-NEXT: orq %rbp, %r14 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $20, %rbp +; AVX512F-ONLY-NEXT: orq %r14, %rbp +; AVX512F-ONLY-NEXT: movq %r13, %r14 +; AVX512F-ONLY-NEXT: shlq $21, %r14 +; AVX512F-ONLY-NEXT: orq %rbp, %r14 +; AVX512F-ONLY-NEXT: movq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $22, %rbp +; AVX512F-ONLY-NEXT: orq %r14, %rbp +; AVX512F-ONLY-NEXT: kmovw %k4, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $23, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movzbl %r14b, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rbp +; AVX512F-ONLY-NEXT: shlq $24, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r14, %r13 +; AVX512F-ONLY-NEXT: shlq $25, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r14, %rbp +; AVX512F-ONLY-NEXT: shlq $26, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movq %r14, %r13 +; AVX512F-ONLY-NEXT: shlq $27, %r13 +; AVX512F-ONLY-NEXT: orq %rbp, %r13 +; AVX512F-ONLY-NEXT: movq %r14, %rbp +; AVX512F-ONLY-NEXT: shlq $28, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: shlq $29, %r14 +; AVX512F-ONLY-NEXT: orq %rbp, %r14 +; AVX512F-ONLY-NEXT: movzbl %bl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $30, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $6, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $31, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r14 +; AVX512F-ONLY-NEXT: shlq $33, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $35, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rbx +; AVX512F-ONLY-NEXT: shlq $37, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rbx +; AVX512F-ONLY-NEXT: shlq $39, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $8, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $41, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r14 +; AVX512F-ONLY-NEXT: shlq $43, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r14 +; AVX512F-ONLY-NEXT: shlq $45, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k3, %k3 +; AVX512F-ONLY-NEXT: shlq $47, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rbx +; AVX512F-ONLY-NEXT: shlq $49, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rbx +; AVX512F-ONLY-NEXT: shlq $51, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $10, %k1, %k3 +; AVX512F-ONLY-NEXT: shlq $53, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r14 +; AVX512F-ONLY-NEXT: shlq $55, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r14 +; AVX512F-ONLY-NEXT: shlq $57, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %rbx, %rax +; AVX512F-ONLY-NEXT: shlq $58, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $59, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movl %ebp, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r14 +; AVX512F-ONLY-NEXT: shlq $60, %r14 +; AVX512F-ONLY-NEXT: orq %rbx, %r14 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $61, %r13 +; AVX512F-ONLY-NEXT: orq %r14, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k2, %k3 +; AVX512F-ONLY-NEXT: movzbl %bpl, %ebx +; AVX512F-ONLY-NEXT: shlq $62, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r13 +; AVX512F-ONLY-NEXT: shlq $63, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $11, %k2, %k3 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $13, %k2, %k3 +; AVX512F-ONLY-NEXT: movq %r13, (%rsi) +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: leaq (%r15,%r15,2), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%r13,4), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%r13,8), %r15 +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $5, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $6, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $9, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $11, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $14, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $13, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $15, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $17, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $15, %k2, %k2 +; AVX512F-ONLY-NEXT: shlq $19, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $20, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $21, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $22, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $23, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $24, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $25, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $26, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $27, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $28, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $29, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $30, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $31, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $33, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $35, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $37, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $39, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $43, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $45, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $47, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $49, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $53, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $55, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $57, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $58, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $59, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $60, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $6, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: movl %ebp, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: shlq $62, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $63, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $7, %k1, %k2 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %r12, 32(%rsi) +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,8), %rax +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $5, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $8, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $9, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $11, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $13, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $9, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $15, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $17, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $19, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $20, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $15, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $21, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $22, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $23, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $24, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $25, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $26, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $27, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %r14b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $28, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $29, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $30, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $11, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %r13 +; AVX512F-ONLY-NEXT: shlq $32, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $33, %r14 +; AVX512F-ONLY-NEXT: orq %r13, %r14 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $35, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $37, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $39, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $43, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-ONLY-NEXT: shlq $45, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $47, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $49, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $53, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $55, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $57, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: movl %ebp, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $58, %r13 +; AVX512F-ONLY-NEXT: orq %r14, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $59, %r14 +; AVX512F-ONLY-NEXT: orq %r13, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $60, %r13 +; AVX512F-ONLY-NEXT: orq %r14, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rbp +; AVX512F-ONLY-NEXT: shlq $61, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: kmovw %k1, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $62, %r12 +; AVX512F-ONLY-NEXT: orq %rbp, %r12 +; AVX512F-ONLY-NEXT: shlq $63, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %rax, 40(%rsi) +; AVX512F-ONLY-NEXT: movzbl %r14b, %r15d +; AVX512F-ONLY-NEXT: movl %r15d, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%rax,4), %r12 +; AVX512F-ONLY-NEXT: leaq (%r12,%rax,8), %rax +; AVX512F-ONLY-NEXT: movzbl %r11b, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %r11 +; AVX512F-ONLY-NEXT: shlq $5, %r11 +; AVX512F-ONLY-NEXT: orq %r12, %r11 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $9, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $11, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $13, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $9, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $15, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $17, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $19, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $20, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $15, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $21, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $22, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $23, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $24, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $25, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $26, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $27, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %r10b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $28, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $29, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $30, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $11, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r13 +; AVX512F-ONLY-NEXT: shlq $32, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $33, %r10 +; AVX512F-ONLY-NEXT: orq %r13, %r10 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $35, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $37, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $39, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $43, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $45, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $47, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $49, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: kmovw %k1, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r10 +; AVX512F-ONLY-NEXT: shlq $53, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r10 +; AVX512F-ONLY-NEXT: shlq $55, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $57, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movl %ebp, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $58, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $59, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $60, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k1, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-ONLY-NEXT: shlq $62, %r10 +; AVX512F-ONLY-NEXT: orq %r12, %r10 +; AVX512F-ONLY-NEXT: kmovw %k1, %r12d +; AVX512F-ONLY-NEXT: movzbl %bpl, %ebp +; AVX512F-ONLY-NEXT: shlq $63, %rbp +; AVX512F-ONLY-NEXT: orq %r10, %rbp +; AVX512F-ONLY-NEXT: kmovw %k0, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k0 +; AVX512F-ONLY-NEXT: orq %r11, %rbp +; AVX512F-ONLY-NEXT: kmovw %k0, %r11d +; AVX512F-ONLY-NEXT: movq %rbp, 16(%rsi) +; AVX512F-ONLY-NEXT: movzbl %r9b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: leaq (%rbx,%rbx,2), %r9 +; AVX512F-ONLY-NEXT: leaq (%r9,%rax,4), %r9 +; AVX512F-ONLY-NEXT: leaq (%r9,%rax,8), %r9 +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $4, %rbx +; AVX512F-ONLY-NEXT: orq %r9, %rbx +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $5, %r9 +; AVX512F-ONLY-NEXT: orq %rbx, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $6, %rbx +; AVX512F-ONLY-NEXT: shlq $7, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movzbl %r8b, %r8d +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %rbx +; AVX512F-ONLY-NEXT: shlq $8, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $9, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r8, %rbx +; AVX512F-ONLY-NEXT: shlq $10, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $11, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r8, %rbx +; AVX512F-ONLY-NEXT: shlq $12, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $13, %r8 +; AVX512F-ONLY-NEXT: orq %rbx, %r8 +; AVX512F-ONLY-NEXT: movzbl %dil, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $14, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $15, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $16, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $17, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $18, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: shlq $19, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movzbl %dl, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $20, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $21, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $22, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $24, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $25, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movzbl %cl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $26, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $27, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $28, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $29, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $30, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rcx +; AVX512F-ONLY-NEXT: shlq $32, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r10, %rcx +; AVX512F-ONLY-NEXT: shlq $34, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r10, %rcx +; AVX512F-ONLY-NEXT: shlq $36, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $37, %r10 +; AVX512F-ONLY-NEXT: orq %rcx, %r10 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $39, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $41, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $43, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $45, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $47, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $49, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $50, %rcx +; AVX512F-ONLY-NEXT: orq %r12, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $51, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $52, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $53, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $54, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: movq %r11, %rcx +; AVX512F-ONLY-NEXT: shlq $56, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r11, %rcx +; AVX512F-ONLY-NEXT: shlq $58, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $59, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r11, %rcx +; AVX512F-ONLY-NEXT: shlq $60, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $61, %r11 +; AVX512F-ONLY-NEXT: orq %rcx, %r11 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: shlq $62, %r14 +; AVX512F-ONLY-NEXT: orq %r11, %r14 +; AVX512F-ONLY-NEXT: shlq $63, %r15 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: orq %r9, %r15 +; AVX512F-ONLY-NEXT: movq %r15, 8(%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r13 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor6_vf64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k3 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k2 +; AVX512DQ-NEXT: kmovw 6(%rdi), %k1 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-NEXT: kshiftrw $15, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ecx +; AVX512DQ-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %edx +; AVX512DQ-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %edi +; AVX512DQ-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r8d +; AVX512DQ-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r9d +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r10d +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r11d +; AVX512DQ-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ebx +; AVX512DQ-NEXT: kshiftrw $10, %k2, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $1, %k2, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k4 +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: movzbl %r14b, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: leaq (%r14,%r14,2), %r12 +; AVX512DQ-NEXT: leaq (%r12,%r14,4), %r12 +; AVX512DQ-NEXT: leaq (%r12,%r14,8), %r12 +; AVX512DQ-NEXT: movq %r14, %r13 +; AVX512DQ-NEXT: shlq $4, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: shlq $5, %r14 +; AVX512DQ-NEXT: orq %r13, %r14 +; AVX512DQ-NEXT: movzbl %r15b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $6, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $7, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $8, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $9, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $10, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $3, %k2, %k4 +; AVX512DQ-NEXT: shlq $11, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %r13b, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $12, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $13, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $14, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $15, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $16, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $4, %k2, %k4 +; AVX512DQ-NEXT: shlq $17, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movzbl %r12b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $18, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $19, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $20, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $21, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $22, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $5, %k2, %k4 +; AVX512DQ-NEXT: shlq $23, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %r15b, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $24, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $25, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $26, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $27, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $28, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $6, %k2, %k4 +; AVX512DQ-NEXT: shlq $29, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movzbl %r12b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $30, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $31, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $32, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $33, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $34, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $7, %k2, %k4 +; AVX512DQ-NEXT: shlq $35, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %r15b, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $36, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $37, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $38, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $39, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $40, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $8, %k2, %k4 +; AVX512DQ-NEXT: shlq $41, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $42, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $43, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $44, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $45, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $46, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $9, %k2, %k4 +; AVX512DQ-NEXT: shlq $47, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $48, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $49, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $50, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $52, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $1, %k3, %k4 +; AVX512DQ-NEXT: shlq $53, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movzbl %r12b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $54, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $55, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $56, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $57, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $58, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: shlq $59, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %r15d +; AVX512DQ-NEXT: movl %ebp, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $60, %rbp +; AVX512DQ-NEXT: orq %r12, %rbp +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 +; AVX512DQ-NEXT: shlq $62, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $63, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: orq %r14, %r12 +; AVX512DQ-NEXT: movq %r12, 24(%rsi) +; AVX512DQ-NEXT: kmovw %k3, %r14d +; AVX512DQ-NEXT: movzbl %r14b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: leaq (%r12,%r12,2), %r14 +; AVX512DQ-NEXT: leaq (%r14,%r12,4), %r14 +; AVX512DQ-NEXT: leaq (%r14,%r12,8), %r14 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $4, %r13 +; AVX512DQ-NEXT: orq %r14, %r13 +; AVX512DQ-NEXT: shlq $5, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r14 +; AVX512DQ-NEXT: shlq $6, %r14 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $7, %rbp +; AVX512DQ-NEXT: orq %r14, %rbp +; AVX512DQ-NEXT: movq %r13, %r14 +; AVX512DQ-NEXT: shlq $8, %r14 +; AVX512DQ-NEXT: orq %rbp, %r14 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $9, %rbp +; AVX512DQ-NEXT: orq %r14, %rbp +; AVX512DQ-NEXT: movq %r13, %r14 +; AVX512DQ-NEXT: shlq $10, %r14 +; AVX512DQ-NEXT: orq %rbp, %r14 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQ-NEXT: shlq $11, %r13 +; AVX512DQ-NEXT: orq %r14, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rbp +; AVX512DQ-NEXT: shlq $12, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r14, %r13 +; AVX512DQ-NEXT: shlq $13, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r14, %rbp +; AVX512DQ-NEXT: shlq $14, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r14, %r13 +; AVX512DQ-NEXT: shlq $15, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r14, %rbp +; AVX512DQ-NEXT: shlq $16, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQ-NEXT: shlq $17, %r14 +; AVX512DQ-NEXT: orq %rbp, %r14 +; AVX512DQ-NEXT: movzbl %r13b, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $18, %rbp +; AVX512DQ-NEXT: orq %r14, %rbp +; AVX512DQ-NEXT: movq %r13, %r14 +; AVX512DQ-NEXT: shlq $19, %r14 +; AVX512DQ-NEXT: orq %rbp, %r14 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $20, %rbp +; AVX512DQ-NEXT: orq %r14, %rbp +; AVX512DQ-NEXT: movq %r13, %r14 +; AVX512DQ-NEXT: shlq $21, %r14 +; AVX512DQ-NEXT: orq %rbp, %r14 +; AVX512DQ-NEXT: movq %r13, %rbp +; AVX512DQ-NEXT: shlq $22, %rbp +; AVX512DQ-NEXT: orq %r14, %rbp +; AVX512DQ-NEXT: kmovw %k4, %r14d +; AVX512DQ-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQ-NEXT: shlq $23, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movzbl %r14b, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rbp +; AVX512DQ-NEXT: shlq $24, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r14, %r13 +; AVX512DQ-NEXT: shlq $25, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r14, %rbp +; AVX512DQ-NEXT: shlq $26, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movq %r14, %r13 +; AVX512DQ-NEXT: shlq $27, %r13 +; AVX512DQ-NEXT: orq %rbp, %r13 +; AVX512DQ-NEXT: movq %r14, %rbp +; AVX512DQ-NEXT: shlq $28, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: shlq $29, %r14 +; AVX512DQ-NEXT: orq %rbp, %r14 +; AVX512DQ-NEXT: movzbl %bl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $30, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQ-NEXT: shlq $31, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %rbx, %r14 +; AVX512DQ-NEXT: shlq $33, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: kmovw %k4, %r14d +; AVX512DQ-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQ-NEXT: shlq $35, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r14, %rbx +; AVX512DQ-NEXT: shlq $37, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r14, %rbx +; AVX512DQ-NEXT: shlq $39, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: kmovw %k4, %ebx +; AVX512DQ-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQ-NEXT: shlq $41, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %rbx, %r14 +; AVX512DQ-NEXT: shlq $43, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %rbx, %r14 +; AVX512DQ-NEXT: shlq $45, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: kmovw %k4, %r14d +; AVX512DQ-NEXT: kshiftrw $9, %k3, %k3 +; AVX512DQ-NEXT: shlq $47, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r14, %rbx +; AVX512DQ-NEXT: shlq $49, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r14, %rbx +; AVX512DQ-NEXT: shlq $51, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: kmovw %k3, %ebx +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k3 +; AVX512DQ-NEXT: shlq $53, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %rbx, %r14 +; AVX512DQ-NEXT: shlq $55, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %rbx, %r14 +; AVX512DQ-NEXT: shlq $57, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %rbx, %rax +; AVX512DQ-NEXT: shlq $58, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: shlq $59, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movl %ebp, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r14 +; AVX512DQ-NEXT: shlq $60, %r14 +; AVX512DQ-NEXT: orq %rbx, %r14 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $61, %r13 +; AVX512DQ-NEXT: orq %r14, %r13 +; AVX512DQ-NEXT: kmovw %k3, %r14d +; AVX512DQ-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQ-NEXT: movzbl %bpl, %ebx +; AVX512DQ-NEXT: shlq $62, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %rbx, %r13 +; AVX512DQ-NEXT: shlq $63, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQ-NEXT: movq %r13, (%rsi) +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: leaq (%r15,%r15,2), %r15 +; AVX512DQ-NEXT: leaq (%r15,%r13,4), %r15 +; AVX512DQ-NEXT: leaq (%r15,%r13,8), %r15 +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $5, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $6, %r12 +; AVX512DQ-NEXT: shlq $7, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $9, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $11, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQ-NEXT: shlq $13, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $15, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $17, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $15, %k2, %k2 +; AVX512DQ-NEXT: shlq $19, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $20, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $21, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $22, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $23, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $24, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k2 +; AVX512DQ-NEXT: shlq $25, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $26, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $27, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $28, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $29, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $30, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: shlq $31, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $33, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $35, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k2, %r13d +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k2 +; AVX512DQ-NEXT: shlq $37, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $39, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k2 +; AVX512DQ-NEXT: shlq $43, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $45, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $47, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k2, %r13d +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k2 +; AVX512DQ-NEXT: shlq $49, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $53, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k2 +; AVX512DQ-NEXT: shlq $55, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $57, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $58, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $59, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $60, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k2 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: movl %ebp, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: shlq $62, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $63, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k2 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %r12, 32(%rsi) +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %r15 +; AVX512DQ-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512DQ-NEXT: leaq (%r15,%rax,8), %rax +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $5, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $7, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 +; AVX512DQ-NEXT: shlq $9, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $11, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $13, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQ-NEXT: shlq $15, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $17, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $19, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $20, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k2 +; AVX512DQ-NEXT: shlq $21, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $22, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $23, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $24, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $25, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $26, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: shlq $27, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %r14b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $28, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $29, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $30, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k2 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %r13 +; AVX512DQ-NEXT: shlq $32, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k2 +; AVX512DQ-NEXT: shlq $33, %r14 +; AVX512DQ-NEXT: orq %r13, %r14 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $35, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $37, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k2 +; AVX512DQ-NEXT: shlq $39, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $43, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k1 +; AVX512DQ-NEXT: shlq $45, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $47, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $49, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: kmovw %k1, %r14d +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k1 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $53, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $55, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: shlq $57, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: movl %ebp, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $58, %r13 +; AVX512DQ-NEXT: orq %r14, %r13 +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $59, %r14 +; AVX512DQ-NEXT: orq %r13, %r14 +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $60, %r13 +; AVX512DQ-NEXT: orq %r14, %r13 +; AVX512DQ-NEXT: movq %r12, %rbp +; AVX512DQ-NEXT: shlq $61, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: kmovw %k1, %r14d +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k1 +; AVX512DQ-NEXT: shlq $62, %r12 +; AVX512DQ-NEXT: orq %rbp, %r12 +; AVX512DQ-NEXT: shlq $63, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %rax, 40(%rsi) +; AVX512DQ-NEXT: movzbl %r14b, %r15d +; AVX512DQ-NEXT: movl %r15d, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %r12 +; AVX512DQ-NEXT: leaq (%r12,%rax,4), %r12 +; AVX512DQ-NEXT: leaq (%r12,%rax,8), %rax +; AVX512DQ-NEXT: movzbl %r11b, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %r11 +; AVX512DQ-NEXT: shlq $5, %r11 +; AVX512DQ-NEXT: orq %r12, %r11 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $7, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512DQ-NEXT: shlq $9, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $11, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $13, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k1 +; AVX512DQ-NEXT: shlq $15, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $17, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $19, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $20, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k1 +; AVX512DQ-NEXT: shlq $21, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $22, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $23, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $24, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $25, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $26, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: shlq $27, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %r10b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $28, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $29, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $30, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k1 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %r13 +; AVX512DQ-NEXT: shlq $32, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512DQ-NEXT: shlq $33, %r10 +; AVX512DQ-NEXT: orq %r13, %r10 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $35, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $37, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: kmovw %k1, %r10d +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k1 +; AVX512DQ-NEXT: shlq $39, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $43, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512DQ-NEXT: shlq $45, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $47, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $49, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: kmovw %k1, %r13d +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k1 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r10 +; AVX512DQ-NEXT: shlq $53, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r13, %r10 +; AVX512DQ-NEXT: shlq $55, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: shlq $57, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movl %ebp, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $58, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $59, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $60, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k1, %r13d +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512DQ-NEXT: shlq $62, %r10 +; AVX512DQ-NEXT: orq %r12, %r10 +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: movzbl %bpl, %ebp +; AVX512DQ-NEXT: shlq $63, %rbp +; AVX512DQ-NEXT: orq %r10, %rbp +; AVX512DQ-NEXT: kmovw %k0, %r10d +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 +; AVX512DQ-NEXT: orq %r11, %rbp +; AVX512DQ-NEXT: kmovw %k0, %r11d +; AVX512DQ-NEXT: movq %rbp, 16(%rsi) +; AVX512DQ-NEXT: movzbl %r9b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: leaq (%rbx,%rbx,2), %r9 +; AVX512DQ-NEXT: leaq (%r9,%rax,4), %r9 +; AVX512DQ-NEXT: leaq (%r9,%rax,8), %r9 +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $4, %rbx +; AVX512DQ-NEXT: orq %r9, %rbx +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $5, %r9 +; AVX512DQ-NEXT: orq %rbx, %r9 +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $6, %rbx +; AVX512DQ-NEXT: shlq $7, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movzbl %r8b, %r8d +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %rbx +; AVX512DQ-NEXT: shlq $8, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $9, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r8, %rbx +; AVX512DQ-NEXT: shlq $10, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $11, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r8, %rbx +; AVX512DQ-NEXT: shlq $12, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: shlq $13, %r8 +; AVX512DQ-NEXT: orq %rbx, %r8 +; AVX512DQ-NEXT: movzbl %dil, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $14, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $15, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $16, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $17, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $18, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: shlq $19, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movzbl %dl, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $20, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: movq %rdx, %rax +; AVX512DQ-NEXT: shlq $21, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $22, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: movq %rdx, %rax +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $24, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: shlq $25, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movzbl %cl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $26, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $27, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $28, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $29, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $30, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rcx +; AVX512DQ-NEXT: shlq $32, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r10, %rcx +; AVX512DQ-NEXT: shlq $34, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r10, %rcx +; AVX512DQ-NEXT: shlq $36, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $37, %r10 +; AVX512DQ-NEXT: orq %rcx, %r10 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $39, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $41, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $43, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $45, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $47, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $49, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $50, %rcx +; AVX512DQ-NEXT: orq %r12, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $51, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $52, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $53, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $54, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: movq %r11, %rcx +; AVX512DQ-NEXT: shlq $56, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r11, %rcx +; AVX512DQ-NEXT: shlq $58, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $59, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r11, %rcx +; AVX512DQ-NEXT: shlq $60, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $61, %r11 +; AVX512DQ-NEXT: orq %rcx, %r11 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: shlq $62, %r14 +; AVX512DQ-NEXT: orq %r11, %r14 +; AVX512DQ-NEXT: shlq $63, %r15 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: orq %r9, %r15 +; AVX512DQ-NEXT: movq %r15, 8(%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor6_vf64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: kshiftrq $15, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrq $14, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrq $13, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrq $12, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrq $26, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrq $22, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrq $44, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $42, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrq $5, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrq $1, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $2, %k0, %k1 +; AVX512BW-NEXT: kmovd %k0, %r10d +; AVX512BW-NEXT: movzbl %r10b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: leaq (%r10,%r10,2), %r12 +; AVX512BW-NEXT: leaq (%r12,%r10,4), %r12 +; AVX512BW-NEXT: leaq (%r12,%r10,8), %r12 +; AVX512BW-NEXT: movq %r10, %r13 +; AVX512BW-NEXT: shlq $4, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: shlq $5, %r10 +; AVX512BW-NEXT: orq %r13, %r10 +; AVX512BW-NEXT: movzbl %r15b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $6, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $7, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $8, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $9, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $10, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $3, %k0, %k1 +; AVX512BW-NEXT: shlq $11, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %r13b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $12, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $13, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $14, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $15, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $16, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $4, %k0, %k1 +; AVX512BW-NEXT: shlq $17, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movzbl %r12b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $18, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $19, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $20, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $21, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $22, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $10, %k0, %k1 +; AVX512BW-NEXT: shlq $23, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %r15b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $24, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $25, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $26, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $27, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $28, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: shlq $29, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movzbl %r9b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $30, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $32, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r9, %r12 +; AVX512BW-NEXT: shlq $33, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $34, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $7, %k0, %k1 +; AVX512BW-NEXT: shlq $35, %r9 +; AVX512BW-NEXT: orq %r13, %r9 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $36, %r13 +; AVX512BW-NEXT: orq %r9, %r13 +; AVX512BW-NEXT: movq %r12, %r9 +; AVX512BW-NEXT: shlq $37, %r9 +; AVX512BW-NEXT: orq %r13, %r9 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $38, %r13 +; AVX512BW-NEXT: orq %r9, %r13 +; AVX512BW-NEXT: movq %r12, %r9 +; AVX512BW-NEXT: shlq $39, %r9 +; AVX512BW-NEXT: orq %r13, %r9 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $40, %r13 +; AVX512BW-NEXT: orq %r9, %r13 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrq $8, %k0, %k1 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $42, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r9, %r12 +; AVX512BW-NEXT: shlq $43, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $44, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r9, %r12 +; AVX512BW-NEXT: shlq $45, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $46, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $9, %k0, %k1 +; AVX512BW-NEXT: shlq $47, %r9 +; AVX512BW-NEXT: orq %r13, %r9 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $48, %r13 +; AVX512BW-NEXT: orq %r9, %r13 +; AVX512BW-NEXT: movq %r12, %r9 +; AVX512BW-NEXT: shlq $49, %r9 +; AVX512BW-NEXT: orq %r13, %r9 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $50, %r13 +; AVX512BW-NEXT: orq %r9, %r13 +; AVX512BW-NEXT: movq %r12, %r9 +; AVX512BW-NEXT: shlq $51, %r9 +; AVX512BW-NEXT: orq %r13, %r9 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $52, %r13 +; AVX512BW-NEXT: orq %r9, %r13 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 +; AVX512BW-NEXT: shlq $53, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $54, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r9, %r12 +; AVX512BW-NEXT: shlq $55, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $56, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r9, %r12 +; AVX512BW-NEXT: shlq $57, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $58, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: shlq $59, %r9 +; AVX512BW-NEXT: orq %r13, %r9 +; AVX512BW-NEXT: movl %r15d, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $60, %r13 +; AVX512BW-NEXT: orq %r9, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $61, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $45, %k0, %k1 +; AVX512BW-NEXT: movzbl %r15b, %r9d +; AVX512BW-NEXT: shlq $62, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $63, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %rax, (%rsi) +; AVX512BW-NEXT: movzbl %r13b, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movzbl %r8b, %r10d +; AVX512BW-NEXT: movl %r10d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %rax +; AVX512BW-NEXT: leaq (%rax,%r13,4), %rax +; AVX512BW-NEXT: leaq (%rax,%r13,8), %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $4, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $5, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: shlq $7, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $9, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $46, %k0, %k1 +; AVX512BW-NEXT: shlq $13, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $15, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $17, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $47, %k0, %k1 +; AVX512BW-NEXT: shlq $19, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $21, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $23, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 +; AVX512BW-NEXT: shlq $25, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $27, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $28, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $29, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $49, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $33, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $35, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $50, %k0, %k1 +; AVX512BW-NEXT: shlq $37, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $39, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $51, %k0, %k1 +; AVX512BW-NEXT: shlq $43, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $45, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $47, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $52, %k0, %k1 +; AVX512BW-NEXT: shlq $49, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $51, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $53, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 +; AVX512BW-NEXT: shlq $55, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $57, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $59, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $54, %k0, %k1 +; AVX512BW-NEXT: shlq $61, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %bpl, %eax +; AVX512BW-NEXT: movl %ebp, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: shlq $62, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $63, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $55, %k0, %k1 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movq %r12, 32(%rsi) +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r15 +; AVX512BW-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512BW-NEXT: leaq (%r15,%rax,8), %rax +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $4, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $5, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: shlq $9, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $13, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $57, %k0, %k1 +; AVX512BW-NEXT: shlq $15, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $17, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $19, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $63, %k0, %k1 +; AVX512BW-NEXT: shlq $21, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $23, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $25, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: shlq $27, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %r14b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $28, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $29, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $30, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %r13 +; AVX512BW-NEXT: shlq $32, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 +; AVX512BW-NEXT: shlq $33, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $35, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $37, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrq $61, %k0, %k1 +; AVX512BW-NEXT: shlq $39, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $43, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $62, %k0, %k1 +; AVX512BW-NEXT: shlq $45, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $47, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $49, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrq $21, %k0, %k1 +; AVX512BW-NEXT: shlq $51, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $53, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $55, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: shlq $57, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movl %ebp, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $58, %r12 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: shlq $59, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $60, %r12 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r14d +; AVX512BW-NEXT: kshiftrq $23, %k0, %k1 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: shlq $62, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: shlq $63, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movq %r12, 40(%rsi) +; AVX512BW-NEXT: movzbl %r14b, %r15d +; AVX512BW-NEXT: movl %r15d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r12 +; AVX512BW-NEXT: leaq (%r12,%rax,4), %r12 +; AVX512BW-NEXT: leaq (%r12,%rax,8), %rax +; AVX512BW-NEXT: movzbl %bl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $4, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $5, %rbx +; AVX512BW-NEXT: orq %r12, %rbx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $24, %k0, %k1 +; AVX512BW-NEXT: shlq $9, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $13, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $25, %k0, %k1 +; AVX512BW-NEXT: shlq $15, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $17, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $19, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $31, %k0, %k1 +; AVX512BW-NEXT: shlq $21, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $23, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $25, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: shlq $27, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %r11b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $28, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $29, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $30, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %r13 +; AVX512BW-NEXT: shlq $32, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $28, %k0, %k1 +; AVX512BW-NEXT: shlq $33, %r11 +; AVX512BW-NEXT: orq %r13, %r11 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r12, %r11 +; AVX512BW-NEXT: shlq $35, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r12, %r11 +; AVX512BW-NEXT: shlq $37, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrq $29, %k0, %k1 +; AVX512BW-NEXT: shlq $39, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r11, %r12 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r11, %r12 +; AVX512BW-NEXT: shlq $43, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $30, %k0, %k1 +; AVX512BW-NEXT: shlq $45, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r12, %r11 +; AVX512BW-NEXT: shlq $47, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r12, %r11 +; AVX512BW-NEXT: shlq $49, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrq $11, %k0, %k1 +; AVX512BW-NEXT: shlq $51, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r11, %r12 +; AVX512BW-NEXT: shlq $53, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r11, %r12 +; AVX512BW-NEXT: shlq $55, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: shlq $57, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movl %ebp, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $58, %r12 +; AVX512BW-NEXT: orq %r11, %r12 +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: shlq $59, %r11 +; AVX512BW-NEXT: orq %r12, %r11 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $60, %r12 +; AVX512BW-NEXT: orq %r11, %r12 +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrq $33, %k0, %k1 +; AVX512BW-NEXT: shlq $62, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $32, %k0, %k1 +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: shlq $63, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $34, %k0, %k1 +; AVX512BW-NEXT: orq %rbx, %r13 +; AVX512BW-NEXT: movq %r13, 16(%rsi) +; AVX512BW-NEXT: movzbl %al, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: leaq (%rbx,%rbx,2), %rax +; AVX512BW-NEXT: leaq (%rax,%rbx,4), %rax +; AVX512BW-NEXT: leaq (%rax,%rbx,8), %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $4, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: shlq $5, %rbx +; AVX512BW-NEXT: orq %r13, %rbx +; AVX512BW-NEXT: movzbl %r12b, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $35, %k0, %k1 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $13, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $15, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $36, %k0, %k1 +; AVX512BW-NEXT: shlq $17, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $19, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $21, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $37, %k0, %k1 +; AVX512BW-NEXT: shlq $23, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $25, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $27, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $28, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $38, %k0, %k1 +; AVX512BW-NEXT: shlq $29, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $31, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $33, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $39, %k0, %k1 +; AVX512BW-NEXT: shlq $35, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $37, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $39, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $40, %k0, %k1 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $43, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $45, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $41, %k0, %k1 +; AVX512BW-NEXT: shlq $47, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbp, %r12 +; AVX512BW-NEXT: shlq $49, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbp, %r12 +; AVX512BW-NEXT: shlq $51, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $16, %k0, %k1 +; AVX512BW-NEXT: shlq $53, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $55, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %rbp +; AVX512BW-NEXT: shlq $57, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $58, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $17, %k0, %k1 +; AVX512BW-NEXT: shlq $59, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r8, %r13 +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $18, %k0, %k1 +; AVX512BW-NEXT: shlq $62, %r8 +; AVX512BW-NEXT: orq %r13, %r8 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $19, %k0, %k1 +; AVX512BW-NEXT: shlq $63, %r10 +; AVX512BW-NEXT: orq %r8, %r10 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrq $20, %k0, %k0 +; AVX512BW-NEXT: orq %rbx, %r10 +; AVX512BW-NEXT: kmovd %k0, %ebx +; AVX512BW-NEXT: movq %r10, 24(%rsi) +; AVX512BW-NEXT: movzbl %r11b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: leaq (%r9,%r9,2), %r9 +; AVX512BW-NEXT: leaq (%r9,%rax,4), %r9 +; AVX512BW-NEXT: leaq (%r9,%rax,8), %r9 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: shlq $4, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $5, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: shlq $6, %r10 +; AVX512BW-NEXT: shlq $7, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movzbl %dil, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r10 +; AVX512BW-NEXT: shlq $8, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $9, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %rdi, %r10 +; AVX512BW-NEXT: shlq $10, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $11, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %rdi, %r10 +; AVX512BW-NEXT: shlq $12, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: shlq $13, %rdi +; AVX512BW-NEXT: orq %r10, %rdi +; AVX512BW-NEXT: movzbl %dl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $14, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $15, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $16, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $17, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $18, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: shlq $19, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movzbl %cl, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $20, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $21, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $22, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $23, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $24, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $25, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $26, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $27, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $28, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $29, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $30, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rcx +; AVX512BW-NEXT: shlq $32, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %rbp, %rcx +; AVX512BW-NEXT: shlq $34, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %rbp, %rcx +; AVX512BW-NEXT: shlq $36, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $37, %rbp +; AVX512BW-NEXT: orq %rcx, %rbp +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $39, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $41, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $43, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $45, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $47, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $49, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r8, %rcx +; AVX512BW-NEXT: shlq $51, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r8, %rcx +; AVX512BW-NEXT: shlq $53, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $55, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rbx, %rcx +; AVX512BW-NEXT: shlq $57, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %rbx, %rcx +; AVX512BW-NEXT: shlq $59, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: shlq $61, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: shlq $62, %r14 +; AVX512BW-NEXT: orq %rbx, %r14 +; AVX512BW-NEXT: shlq $63, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: orq %r9, %r15 +; AVX512BW-NEXT: movq %r15, 8(%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <64 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <384 x i32> + store <384 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor7_vf2(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor7_vf2: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k0 +; AVX512F-ONLY-NEXT: kmovw %k1, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: leal (%rdx,%rdx,2), %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rdx,4), %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rdx,8), %eax +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $4, %ecx +; AVX512F-ONLY-NEXT: orl %eax, %ecx +; AVX512F-ONLY-NEXT: movl %edx, %eax +; AVX512F-ONLY-NEXT: shll $5, %eax +; AVX512F-ONLY-NEXT: orl %ecx, %eax +; AVX512F-ONLY-NEXT: shll $6, %edx +; AVX512F-ONLY-NEXT: kmovw %k0, %ecx +; AVX512F-ONLY-NEXT: movl %ecx, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $7, %r8d +; AVX512F-ONLY-NEXT: orl %edx, %r8d +; AVX512F-ONLY-NEXT: movl %edi, %edx +; AVX512F-ONLY-NEXT: shll $8, %edx +; AVX512F-ONLY-NEXT: orl %r8d, %edx +; AVX512F-ONLY-NEXT: movl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $9, %r8d +; AVX512F-ONLY-NEXT: orl %edx, %r8d +; AVX512F-ONLY-NEXT: movl %edi, %edx +; AVX512F-ONLY-NEXT: shll $10, %edx +; AVX512F-ONLY-NEXT: orl %r8d, %edx +; AVX512F-ONLY-NEXT: movl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $11, %r8d +; AVX512F-ONLY-NEXT: orl %edx, %r8d +; AVX512F-ONLY-NEXT: shll $12, %edi +; AVX512F-ONLY-NEXT: orl %r8d, %edi +; AVX512F-ONLY-NEXT: shll $13, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: orl %eax, %ecx +; AVX512F-ONLY-NEXT: andl $16383, %ecx # imm = 0x3FFF +; AVX512F-ONLY-NEXT: movw %cx, (%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor7_vf2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k1 +; AVX512DQ-NEXT: kshiftrb $1, %k1, %k0 +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: leal (%rdx,%rdx,2), %eax +; AVX512DQ-NEXT: leal (%rax,%rdx,4), %eax +; AVX512DQ-NEXT: leal (%rax,%rdx,8), %eax +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $4, %ecx +; AVX512DQ-NEXT: orl %eax, %ecx +; AVX512DQ-NEXT: movl %edx, %eax +; AVX512DQ-NEXT: shll $5, %eax +; AVX512DQ-NEXT: orl %ecx, %eax +; AVX512DQ-NEXT: shll $6, %edx +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: movl %ecx, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movl %edi, %r8d +; AVX512DQ-NEXT: shll $7, %r8d +; AVX512DQ-NEXT: orl %edx, %r8d +; AVX512DQ-NEXT: movl %edi, %edx +; AVX512DQ-NEXT: shll $8, %edx +; AVX512DQ-NEXT: orl %r8d, %edx +; AVX512DQ-NEXT: movl %edi, %r8d +; AVX512DQ-NEXT: shll $9, %r8d +; AVX512DQ-NEXT: orl %edx, %r8d +; AVX512DQ-NEXT: movl %edi, %edx +; AVX512DQ-NEXT: shll $10, %edx +; AVX512DQ-NEXT: orl %r8d, %edx +; AVX512DQ-NEXT: movl %edi, %r8d +; AVX512DQ-NEXT: shll $11, %r8d +; AVX512DQ-NEXT: orl %edx, %r8d +; AVX512DQ-NEXT: shll $12, %edi +; AVX512DQ-NEXT: orl %r8d, %edi +; AVX512DQ-NEXT: shll $13, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: orl %eax, %ecx +; AVX512DQ-NEXT: andl $16383, %ecx # imm = 0x3FFF +; AVX512DQ-NEXT: movw %cx, (%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor7_vf2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k0 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: leal (%rdx,%rdx,2), %eax +; AVX512BW-NEXT: leal (%rax,%rdx,4), %eax +; AVX512BW-NEXT: leal (%rax,%rdx,8), %eax +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $4, %ecx +; AVX512BW-NEXT: orl %eax, %ecx +; AVX512BW-NEXT: movl %edx, %eax +; AVX512BW-NEXT: shll $5, %eax +; AVX512BW-NEXT: orl %ecx, %eax +; AVX512BW-NEXT: shll $6, %edx +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: movl %ecx, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movl %edi, %r8d +; AVX512BW-NEXT: shll $7, %r8d +; AVX512BW-NEXT: orl %edx, %r8d +; AVX512BW-NEXT: movl %edi, %edx +; AVX512BW-NEXT: shll $8, %edx +; AVX512BW-NEXT: orl %r8d, %edx +; AVX512BW-NEXT: movl %edi, %r8d +; AVX512BW-NEXT: shll $9, %r8d +; AVX512BW-NEXT: orl %edx, %r8d +; AVX512BW-NEXT: movl %edi, %edx +; AVX512BW-NEXT: shll $10, %edx +; AVX512BW-NEXT: orl %r8d, %edx +; AVX512BW-NEXT: movl %edi, %r8d +; AVX512BW-NEXT: shll $11, %r8d +; AVX512BW-NEXT: orl %edx, %r8d +; AVX512BW-NEXT: shll $12, %edi +; AVX512BW-NEXT: orl %r8d, %edi +; AVX512BW-NEXT: shll $13, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: orl %eax, %ecx +; AVX512BW-NEXT: andl $16383, %ecx # imm = 0x3FFF +; AVX512BW-NEXT: movw %cx, (%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <2 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <14 x i32> + store <14 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor7_vf4(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor7_vf4: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3 +; AVX512F-ONLY-NEXT: kshiftrw $3, %k3, %k0 +; AVX512F-ONLY-NEXT: kshiftrw $2, %k3, %k1 +; AVX512F-ONLY-NEXT: kshiftrw $1, %k3, %k2 +; AVX512F-ONLY-NEXT: kmovw %k3, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: leal (%rdx,%rdx,2), %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rdx,4), %eax +; AVX512F-ONLY-NEXT: leal (%rax,%rdx,8), %eax +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $4, %ecx +; AVX512F-ONLY-NEXT: orl %eax, %ecx +; AVX512F-ONLY-NEXT: movl %edx, %eax +; AVX512F-ONLY-NEXT: shll $5, %eax +; AVX512F-ONLY-NEXT: orl %ecx, %eax +; AVX512F-ONLY-NEXT: shll $6, %edx +; AVX512F-ONLY-NEXT: kmovw %k2, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movl %ecx, %edi +; AVX512F-ONLY-NEXT: shll $7, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $8, %edx +; AVX512F-ONLY-NEXT: orl %edi, %edx +; AVX512F-ONLY-NEXT: movl %ecx, %edi +; AVX512F-ONLY-NEXT: shll $9, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $10, %edx +; AVX512F-ONLY-NEXT: orl %edi, %edx +; AVX512F-ONLY-NEXT: movl %ecx, %edi +; AVX512F-ONLY-NEXT: shll $11, %edi +; AVX512F-ONLY-NEXT: orl %edx, %edi +; AVX512F-ONLY-NEXT: movl %ecx, %edx +; AVX512F-ONLY-NEXT: shll $12, %edx +; AVX512F-ONLY-NEXT: orl %edi, %edx +; AVX512F-ONLY-NEXT: shll $13, %ecx +; AVX512F-ONLY-NEXT: orl %edx, %ecx +; AVX512F-ONLY-NEXT: kmovw %k1, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $14, %edi +; AVX512F-ONLY-NEXT: orl %ecx, %edi +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $15, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $16, %edi +; AVX512F-ONLY-NEXT: orl %ecx, %edi +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $17, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: movl %edx, %edi +; AVX512F-ONLY-NEXT: shll $18, %edi +; AVX512F-ONLY-NEXT: orl %ecx, %edi +; AVX512F-ONLY-NEXT: movl %edx, %ecx +; AVX512F-ONLY-NEXT: shll $19, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: shll $20, %edx +; AVX512F-ONLY-NEXT: orl %ecx, %edx +; AVX512F-ONLY-NEXT: kmovw %k0, %ecx +; AVX512F-ONLY-NEXT: movl %ecx, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $21, %r8d +; AVX512F-ONLY-NEXT: orl %edx, %r8d +; AVX512F-ONLY-NEXT: movl %edi, %edx +; AVX512F-ONLY-NEXT: shll $22, %edx +; AVX512F-ONLY-NEXT: orl %r8d, %edx +; AVX512F-ONLY-NEXT: movl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $23, %r8d +; AVX512F-ONLY-NEXT: orl %edx, %r8d +; AVX512F-ONLY-NEXT: movl %edi, %edx +; AVX512F-ONLY-NEXT: shll $24, %edx +; AVX512F-ONLY-NEXT: orl %r8d, %edx +; AVX512F-ONLY-NEXT: movl %edi, %r8d +; AVX512F-ONLY-NEXT: shll $25, %r8d +; AVX512F-ONLY-NEXT: orl %edx, %r8d +; AVX512F-ONLY-NEXT: shll $26, %edi +; AVX512F-ONLY-NEXT: orl %r8d, %edi +; AVX512F-ONLY-NEXT: shll $27, %ecx +; AVX512F-ONLY-NEXT: orl %edi, %ecx +; AVX512F-ONLY-NEXT: orl %eax, %ecx +; AVX512F-ONLY-NEXT: andl $268435455, %ecx # imm = 0xFFFFFFF +; AVX512F-ONLY-NEXT: movl %ecx, (%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor7_vf4: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k3 +; AVX512DQ-NEXT: kshiftrb $3, %k3, %k0 +; AVX512DQ-NEXT: kshiftrb $2, %k3, %k1 +; AVX512DQ-NEXT: kshiftrb $1, %k3, %k2 +; AVX512DQ-NEXT: kmovw %k3, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: leal (%rdx,%rdx,2), %eax +; AVX512DQ-NEXT: leal (%rax,%rdx,4), %eax +; AVX512DQ-NEXT: leal (%rax,%rdx,8), %eax +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $4, %ecx +; AVX512DQ-NEXT: orl %eax, %ecx +; AVX512DQ-NEXT: movl %edx, %eax +; AVX512DQ-NEXT: shll $5, %eax +; AVX512DQ-NEXT: orl %ecx, %eax +; AVX512DQ-NEXT: shll $6, %edx +; AVX512DQ-NEXT: kmovw %k2, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movl %ecx, %edi +; AVX512DQ-NEXT: shll $7, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: shll $8, %edx +; AVX512DQ-NEXT: orl %edi, %edx +; AVX512DQ-NEXT: movl %ecx, %edi +; AVX512DQ-NEXT: shll $9, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: shll $10, %edx +; AVX512DQ-NEXT: orl %edi, %edx +; AVX512DQ-NEXT: movl %ecx, %edi +; AVX512DQ-NEXT: shll $11, %edi +; AVX512DQ-NEXT: orl %edx, %edi +; AVX512DQ-NEXT: movl %ecx, %edx +; AVX512DQ-NEXT: shll $12, %edx +; AVX512DQ-NEXT: orl %edi, %edx +; AVX512DQ-NEXT: shll $13, %ecx +; AVX512DQ-NEXT: orl %edx, %ecx +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $14, %edi +; AVX512DQ-NEXT: orl %ecx, %edi +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $15, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $16, %edi +; AVX512DQ-NEXT: orl %ecx, %edi +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $17, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: movl %edx, %edi +; AVX512DQ-NEXT: shll $18, %edi +; AVX512DQ-NEXT: orl %ecx, %edi +; AVX512DQ-NEXT: movl %edx, %ecx +; AVX512DQ-NEXT: shll $19, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: shll $20, %edx +; AVX512DQ-NEXT: orl %ecx, %edx +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: movl %ecx, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movl %edi, %r8d +; AVX512DQ-NEXT: shll $21, %r8d +; AVX512DQ-NEXT: orl %edx, %r8d +; AVX512DQ-NEXT: movl %edi, %edx +; AVX512DQ-NEXT: shll $22, %edx +; AVX512DQ-NEXT: orl %r8d, %edx +; AVX512DQ-NEXT: movl %edi, %r8d +; AVX512DQ-NEXT: shll $23, %r8d +; AVX512DQ-NEXT: orl %edx, %r8d +; AVX512DQ-NEXT: movl %edi, %edx +; AVX512DQ-NEXT: shll $24, %edx +; AVX512DQ-NEXT: orl %r8d, %edx +; AVX512DQ-NEXT: movl %edi, %r8d +; AVX512DQ-NEXT: shll $25, %r8d +; AVX512DQ-NEXT: orl %edx, %r8d +; AVX512DQ-NEXT: shll $26, %edi +; AVX512DQ-NEXT: orl %r8d, %edi +; AVX512DQ-NEXT: shll $27, %ecx +; AVX512DQ-NEXT: orl %edi, %ecx +; AVX512DQ-NEXT: orl %eax, %ecx +; AVX512DQ-NEXT: andl $268435455, %ecx # imm = 0xFFFFFFF +; AVX512DQ-NEXT: movl %ecx, (%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor7_vf4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k3 +; AVX512BW-NEXT: kshiftrw $3, %k3, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k3, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k2 +; AVX512BW-NEXT: kmovd %k3, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: leal (%rdx,%rdx,2), %eax +; AVX512BW-NEXT: leal (%rax,%rdx,4), %eax +; AVX512BW-NEXT: leal (%rax,%rdx,8), %eax +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $4, %ecx +; AVX512BW-NEXT: orl %eax, %ecx +; AVX512BW-NEXT: movl %edx, %eax +; AVX512BW-NEXT: shll $5, %eax +; AVX512BW-NEXT: orl %ecx, %eax +; AVX512BW-NEXT: shll $6, %edx +; AVX512BW-NEXT: kmovd %k2, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movl %ecx, %edi +; AVX512BW-NEXT: shll $7, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: shll $8, %edx +; AVX512BW-NEXT: orl %edi, %edx +; AVX512BW-NEXT: movl %ecx, %edi +; AVX512BW-NEXT: shll $9, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: shll $10, %edx +; AVX512BW-NEXT: orl %edi, %edx +; AVX512BW-NEXT: movl %ecx, %edi +; AVX512BW-NEXT: shll $11, %edi +; AVX512BW-NEXT: orl %edx, %edi +; AVX512BW-NEXT: movl %ecx, %edx +; AVX512BW-NEXT: shll $12, %edx +; AVX512BW-NEXT: orl %edi, %edx +; AVX512BW-NEXT: shll $13, %ecx +; AVX512BW-NEXT: orl %edx, %ecx +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $14, %edi +; AVX512BW-NEXT: orl %ecx, %edi +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $15, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $16, %edi +; AVX512BW-NEXT: orl %ecx, %edi +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $17, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: movl %edx, %edi +; AVX512BW-NEXT: shll $18, %edi +; AVX512BW-NEXT: orl %ecx, %edi +; AVX512BW-NEXT: movl %edx, %ecx +; AVX512BW-NEXT: shll $19, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: shll $20, %edx +; AVX512BW-NEXT: orl %ecx, %edx +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: movl %ecx, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movl %edi, %r8d +; AVX512BW-NEXT: shll $21, %r8d +; AVX512BW-NEXT: orl %edx, %r8d +; AVX512BW-NEXT: movl %edi, %edx +; AVX512BW-NEXT: shll $22, %edx +; AVX512BW-NEXT: orl %r8d, %edx +; AVX512BW-NEXT: movl %edi, %r8d +; AVX512BW-NEXT: shll $23, %r8d +; AVX512BW-NEXT: orl %edx, %r8d +; AVX512BW-NEXT: movl %edi, %edx +; AVX512BW-NEXT: shll $24, %edx +; AVX512BW-NEXT: orl %r8d, %edx +; AVX512BW-NEXT: movl %edi, %r8d +; AVX512BW-NEXT: shll $25, %r8d +; AVX512BW-NEXT: orl %edx, %r8d +; AVX512BW-NEXT: shll $26, %edi +; AVX512BW-NEXT: orl %r8d, %edi +; AVX512BW-NEXT: shll $27, %ecx +; AVX512BW-NEXT: orl %edi, %ecx +; AVX512BW-NEXT: orl %eax, %ecx +; AVX512BW-NEXT: andl $268435455, %ecx # imm = 0xFFFFFFF +; AVX512BW-NEXT: movl %ecx, (%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <4 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <28 x i32> + store <28 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor7_vf8(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor7_vf8: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2 +; AVX512F-ONLY-NEXT: kshiftrw $7, %k2, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, %eax +; AVX512F-ONLY-NEXT: kshiftrw $6, %k2, %k0 +; AVX512F-ONLY-NEXT: kshiftrw $5, %k2, %k1 +; AVX512F-ONLY-NEXT: kshiftrw $4, %k2, %k3 +; AVX512F-ONLY-NEXT: kmovw %k3, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $3, %k2, %k3 +; AVX512F-ONLY-NEXT: kmovw %k3, %edx +; AVX512F-ONLY-NEXT: kshiftrw $2, %k2, %k3 +; AVX512F-ONLY-NEXT: kmovw %k3, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k2, %k3 +; AVX512F-ONLY-NEXT: kmovw %k3, %r9d +; AVX512F-ONLY-NEXT: kmovw %k2, %edi +; AVX512F-ONLY-NEXT: movzbl %dil, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: leaq (%r10,%r10,2), %rdi +; AVX512F-ONLY-NEXT: leaq (%rdi,%r10,4), %rdi +; AVX512F-ONLY-NEXT: leaq (%rdi,%r10,8), %rdi +; AVX512F-ONLY-NEXT: movq %r10, %r11 +; AVX512F-ONLY-NEXT: shlq $4, %r11 +; AVX512F-ONLY-NEXT: orq %rdi, %r11 +; AVX512F-ONLY-NEXT: movq %r10, %rdi +; AVX512F-ONLY-NEXT: shlq $5, %rdi +; AVX512F-ONLY-NEXT: orq %r11, %rdi +; AVX512F-ONLY-NEXT: shlq $6, %r10 +; AVX512F-ONLY-NEXT: movzbl %r9b, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $7, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $8, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $9, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $10, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $11, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $12, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: shlq $13, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movzbl %r8b, %r8d +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %r10 +; AVX512F-ONLY-NEXT: shlq $14, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $15, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %r8, %r10 +; AVX512F-ONLY-NEXT: shlq $16, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $17, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %r8, %r10 +; AVX512F-ONLY-NEXT: shlq $18, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $19, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: shlq $20, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: movzbl %dl, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %rdx +; AVX512F-ONLY-NEXT: shlq $21, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: movq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $22, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: movq %r9, %rdx +; AVX512F-ONLY-NEXT: shlq $23, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: movq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $24, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: movq %r9, %rdx +; AVX512F-ONLY-NEXT: shlq $25, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: movq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $26, %r8 +; AVX512F-ONLY-NEXT: orq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $27, %r9 +; AVX512F-ONLY-NEXT: orq %r8, %r9 +; AVX512F-ONLY-NEXT: movzbl %cl, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $28, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: movq %rdx, %r9 +; AVX512F-ONLY-NEXT: shlq $29, %r9 +; AVX512F-ONLY-NEXT: orq %r8, %r9 +; AVX512F-ONLY-NEXT: movq %rdx, %r8 +; AVX512F-ONLY-NEXT: shlq $30, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: shlq $31, %rdx +; AVX512F-ONLY-NEXT: orq %r8, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movl %edx, (%rsi) +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $32, %rdi +; AVX512F-ONLY-NEXT: movq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $33, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $34, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: kmovw %k1, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $35, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rcx +; AVX512F-ONLY-NEXT: shlq $36, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $37, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rcx +; AVX512F-ONLY-NEXT: shlq $38, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $39, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rcx +; AVX512F-ONLY-NEXT: shlq $40, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: shlq $41, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: kmovw %k0, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $42, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $43, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $44, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $45, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: movq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $46, %r8 +; AVX512F-ONLY-NEXT: orq %rdi, %r8 +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $47, %rdi +; AVX512F-ONLY-NEXT: orq %r8, %rdi +; AVX512F-ONLY-NEXT: shlq $48, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: movzbl %al, %edi +; AVX512F-ONLY-NEXT: # kill: def $eax killed $eax def $rax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $49, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $50, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $51, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $52, %rcx +; AVX512F-ONLY-NEXT: orq %r8, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $53, %r8 +; AVX512F-ONLY-NEXT: orq %rcx, %r8 +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $55, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shrq $48, %rax +; AVX512F-ONLY-NEXT: movb %al, 6(%rsi) +; AVX512F-ONLY-NEXT: shrq $32, %rdi +; AVX512F-ONLY-NEXT: movw %di, 4(%rsi) +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor7_vf8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k2 +; AVX512DQ-NEXT: kshiftrb $7, %k2, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: kshiftrb $6, %k2, %k0 +; AVX512DQ-NEXT: kshiftrb $5, %k2, %k1 +; AVX512DQ-NEXT: kshiftrb $4, %k2, %k3 +; AVX512DQ-NEXT: kmovw %k3, %ecx +; AVX512DQ-NEXT: kshiftrb $3, %k2, %k3 +; AVX512DQ-NEXT: kmovw %k3, %edx +; AVX512DQ-NEXT: kshiftrb $2, %k2, %k3 +; AVX512DQ-NEXT: kmovw %k3, %r8d +; AVX512DQ-NEXT: kshiftrb $1, %k2, %k3 +; AVX512DQ-NEXT: kmovw %k3, %r9d +; AVX512DQ-NEXT: kmovw %k2, %edi +; AVX512DQ-NEXT: movzbl %dil, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: leaq (%r10,%r10,2), %rdi +; AVX512DQ-NEXT: leaq (%rdi,%r10,4), %rdi +; AVX512DQ-NEXT: leaq (%rdi,%r10,8), %rdi +; AVX512DQ-NEXT: movq %r10, %r11 +; AVX512DQ-NEXT: shlq $4, %r11 +; AVX512DQ-NEXT: orq %rdi, %r11 +; AVX512DQ-NEXT: movq %r10, %rdi +; AVX512DQ-NEXT: shlq $5, %rdi +; AVX512DQ-NEXT: orq %r11, %rdi +; AVX512DQ-NEXT: shlq $6, %r10 +; AVX512DQ-NEXT: movzbl %r9b, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $7, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $8, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $9, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $10, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $11, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $12, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: shlq $13, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movzbl %r8b, %r8d +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %r10 +; AVX512DQ-NEXT: shlq $14, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $15, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %r8, %r10 +; AVX512DQ-NEXT: shlq $16, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $17, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %r8, %r10 +; AVX512DQ-NEXT: shlq $18, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $19, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: shlq $20, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: movzbl %dl, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %rdx +; AVX512DQ-NEXT: shlq $21, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: movq %r9, %r8 +; AVX512DQ-NEXT: shlq $22, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: movq %r9, %rdx +; AVX512DQ-NEXT: shlq $23, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: movq %r9, %r8 +; AVX512DQ-NEXT: shlq $24, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: movq %r9, %rdx +; AVX512DQ-NEXT: shlq $25, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: movq %r9, %r8 +; AVX512DQ-NEXT: shlq $26, %r8 +; AVX512DQ-NEXT: orq %rdx, %r8 +; AVX512DQ-NEXT: shlq $27, %r9 +; AVX512DQ-NEXT: orq %r8, %r9 +; AVX512DQ-NEXT: movzbl %cl, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %r8 +; AVX512DQ-NEXT: shlq $28, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: movq %rdx, %r9 +; AVX512DQ-NEXT: shlq $29, %r9 +; AVX512DQ-NEXT: orq %r8, %r9 +; AVX512DQ-NEXT: movq %rdx, %r8 +; AVX512DQ-NEXT: shlq $30, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: shlq $31, %rdx +; AVX512DQ-NEXT: orq %r8, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movl %edx, (%rsi) +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $32, %rdi +; AVX512DQ-NEXT: movq %rcx, %r8 +; AVX512DQ-NEXT: shlq $33, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: shlq $34, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: kmovw %k1, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $35, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: movq %rdi, %rcx +; AVX512DQ-NEXT: shlq $36, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $37, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: movq %rdi, %rcx +; AVX512DQ-NEXT: shlq $38, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $39, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: movq %rdi, %rcx +; AVX512DQ-NEXT: shlq $40, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: shlq $41, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %r8 +; AVX512DQ-NEXT: shlq $42, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $43, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %rcx, %r8 +; AVX512DQ-NEXT: shlq $44, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $45, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: movq %rcx, %r8 +; AVX512DQ-NEXT: shlq $46, %r8 +; AVX512DQ-NEXT: orq %rdi, %r8 +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $47, %rdi +; AVX512DQ-NEXT: orq %r8, %rdi +; AVX512DQ-NEXT: shlq $48, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: movzbl %al, %edi +; AVX512DQ-NEXT: # kill: def $eax killed $eax def $rax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $49, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $50, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $51, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $52, %rcx +; AVX512DQ-NEXT: orq %r8, %rcx +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $53, %r8 +; AVX512DQ-NEXT: orq %rcx, %r8 +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: shlq $55, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shrq $48, %rax +; AVX512DQ-NEXT: movb %al, 6(%rsi) +; AVX512DQ-NEXT: shrq $32, %rdi +; AVX512DQ-NEXT: movw %di, 4(%rsi) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor7_vf8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k2 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: kshiftrw $6, %k2, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k3 +; AVX512BW-NEXT: kmovd %k3, %ecx +; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 +; AVX512BW-NEXT: kmovd %k3, %edx +; AVX512BW-NEXT: kshiftrw $2, %k2, %k3 +; AVX512BW-NEXT: kmovd %k3, %r8d +; AVX512BW-NEXT: kshiftrw $1, %k2, %k3 +; AVX512BW-NEXT: kmovd %k3, %r9d +; AVX512BW-NEXT: kmovd %k2, %edi +; AVX512BW-NEXT: movzbl %dil, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: leaq (%r10,%r10,2), %rdi +; AVX512BW-NEXT: leaq (%rdi,%r10,4), %rdi +; AVX512BW-NEXT: leaq (%rdi,%r10,8), %rdi +; AVX512BW-NEXT: movq %r10, %r11 +; AVX512BW-NEXT: shlq $4, %r11 +; AVX512BW-NEXT: orq %rdi, %r11 +; AVX512BW-NEXT: movq %r10, %rdi +; AVX512BW-NEXT: shlq $5, %rdi +; AVX512BW-NEXT: orq %r11, %rdi +; AVX512BW-NEXT: shlq $6, %r10 +; AVX512BW-NEXT: movzbl %r9b, %r9d +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $7, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $8, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $9, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $10, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $11, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $12, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: shlq $13, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movzbl %r8b, %r8d +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %r10 +; AVX512BW-NEXT: shlq $14, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $15, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movq %r8, %r10 +; AVX512BW-NEXT: shlq $16, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $17, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movq %r8, %r10 +; AVX512BW-NEXT: shlq $18, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $19, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: shlq $20, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: movzbl %dl, %r9d +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %rdx +; AVX512BW-NEXT: shlq $21, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: movq %r9, %r8 +; AVX512BW-NEXT: shlq $22, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: movq %r9, %rdx +; AVX512BW-NEXT: shlq $23, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: movq %r9, %r8 +; AVX512BW-NEXT: shlq $24, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: movq %r9, %rdx +; AVX512BW-NEXT: shlq $25, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: movq %r9, %r8 +; AVX512BW-NEXT: shlq $26, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 +; AVX512BW-NEXT: shlq $27, %r9 +; AVX512BW-NEXT: orq %r8, %r9 +; AVX512BW-NEXT: movzbl %cl, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %r8 +; AVX512BW-NEXT: shlq $28, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: movq %rdx, %r9 +; AVX512BW-NEXT: shlq $29, %r9 +; AVX512BW-NEXT: orq %r8, %r9 +; AVX512BW-NEXT: movq %rdx, %r8 +; AVX512BW-NEXT: shlq $30, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: shlq $31, %rdx +; AVX512BW-NEXT: orq %r8, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movl %edx, (%rsi) +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $32, %rdi +; AVX512BW-NEXT: movq %rcx, %r8 +; AVX512BW-NEXT: shlq $33, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: shlq $34, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $35, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: movq %rdi, %rcx +; AVX512BW-NEXT: shlq $36, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $37, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: movq %rdi, %rcx +; AVX512BW-NEXT: shlq $38, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $39, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: movq %rdi, %rcx +; AVX512BW-NEXT: shlq $40, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: shlq $41, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %r8 +; AVX512BW-NEXT: shlq $42, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $43, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: movq %rcx, %r8 +; AVX512BW-NEXT: shlq $44, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $45, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: movq %rcx, %r8 +; AVX512BW-NEXT: shlq $46, %r8 +; AVX512BW-NEXT: orq %rdi, %r8 +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $47, %rdi +; AVX512BW-NEXT: orq %r8, %rdi +; AVX512BW-NEXT: shlq $48, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: movzbl %al, %edi +; AVX512BW-NEXT: # kill: def $eax killed $eax def $rax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $49, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $50, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $51, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $52, %rcx +; AVX512BW-NEXT: orq %r8, %rcx +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $53, %r8 +; AVX512BW-NEXT: orq %rcx, %r8 +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: shlq $55, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shrq $48, %rax +; AVX512BW-NEXT: movb %al, 6(%rsi) +; AVX512BW-NEXT: shrq $32, %rdi +; AVX512BW-NEXT: movw %di, 4(%rsi) +; AVX512BW-NEXT: retq + %src.vec = load <8 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <56 x i32> + store <56 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor7_vf16(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor7_vf16: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 +; AVX512F-ONLY-NEXT: kshiftrw $8, %k4, %k0 +; AVX512F-ONLY-NEXT: kshiftrw $7, %k4, %k1 +; AVX512F-ONLY-NEXT: kshiftrw $6, %k4, %k2 +; AVX512F-ONLY-NEXT: kshiftrw $5, %k4, %k3 +; AVX512F-ONLY-NEXT: kshiftrw $4, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $3, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %edx +; AVX512F-ONLY-NEXT: kshiftrw $2, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %edi +; AVX512F-ONLY-NEXT: kshiftrw $1, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $15, %k4, %k5 +; AVX512F-ONLY-NEXT: kmovw %k5, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k4, %k5 +; AVX512F-ONLY-NEXT: kshiftrw $13, %k4, %k6 +; AVX512F-ONLY-NEXT: kmovw %k6, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k4, %k6 +; AVX512F-ONLY-NEXT: kmovw %k6, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $11, %k4, %k6 +; AVX512F-ONLY-NEXT: kmovw %k6, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $10, %k4, %k6 +; AVX512F-ONLY-NEXT: kmovw %k6, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k4, %k6 +; AVX512F-ONLY-NEXT: kmovw %k6, %eax +; AVX512F-ONLY-NEXT: movzbl %al, %eax +; AVX512F-ONLY-NEXT: movl %eax, %r11d +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: leaq (%r11,%r11,2), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%r11,4), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%r11,8), %r15 +; AVX512F-ONLY-NEXT: movq %r11, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $5, %r11 +; AVX512F-ONLY-NEXT: orq %r12, %r11 +; AVX512F-ONLY-NEXT: movzbl %r14b, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r14 +; AVX512F-ONLY-NEXT: shlq $6, %r14 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r12 +; AVX512F-ONLY-NEXT: orq %r14, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r14 +; AVX512F-ONLY-NEXT: shlq $8, %r14 +; AVX512F-ONLY-NEXT: orq %r12, %r14 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $9, %r12 +; AVX512F-ONLY-NEXT: orq %r14, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %r14 +; AVX512F-ONLY-NEXT: shlq $10, %r14 +; AVX512F-ONLY-NEXT: orq %r12, %r14 +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $11, %r12 +; AVX512F-ONLY-NEXT: orq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $12, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r14d +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $13, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %r15 +; AVX512F-ONLY-NEXT: shlq $14, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $15, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %r15 +; AVX512F-ONLY-NEXT: shlq $16, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $17, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %r15 +; AVX512F-ONLY-NEXT: shlq $18, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $19, %r14 +; AVX512F-ONLY-NEXT: orq %r15, %r14 +; AVX512F-ONLY-NEXT: movzbl %bl, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rbx +; AVX512F-ONLY-NEXT: shlq $20, %rbx +; AVX512F-ONLY-NEXT: orq %r14, %rbx +; AVX512F-ONLY-NEXT: movq %r15, %r14 +; AVX512F-ONLY-NEXT: shlq $21, %r14 +; AVX512F-ONLY-NEXT: orq %rbx, %r14 +; AVX512F-ONLY-NEXT: movq %r15, %rbx +; AVX512F-ONLY-NEXT: shlq $22, %rbx +; AVX512F-ONLY-NEXT: orq %r14, %rbx +; AVX512F-ONLY-NEXT: movq %r15, %r14 +; AVX512F-ONLY-NEXT: shlq $23, %r14 +; AVX512F-ONLY-NEXT: orq %rbx, %r14 +; AVX512F-ONLY-NEXT: movq %r15, %rbx +; AVX512F-ONLY-NEXT: shlq $24, %rbx +; AVX512F-ONLY-NEXT: orq %r14, %rbx +; AVX512F-ONLY-NEXT: movq %r15, %r14 +; AVX512F-ONLY-NEXT: shlq $25, %r14 +; AVX512F-ONLY-NEXT: orq %rbx, %r14 +; AVX512F-ONLY-NEXT: shlq $26, %r15 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: movzbl %r10b, %ebx +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: movq %rbx, %r14 +; AVX512F-ONLY-NEXT: shlq $27, %r14 +; AVX512F-ONLY-NEXT: orq %r15, %r14 +; AVX512F-ONLY-NEXT: movq %rbx, %r15 +; AVX512F-ONLY-NEXT: shlq $28, %r15 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: movq %rbx, %r14 +; AVX512F-ONLY-NEXT: shlq $29, %r14 +; AVX512F-ONLY-NEXT: orq %r15, %r14 +; AVX512F-ONLY-NEXT: movq %rbx, %r15 +; AVX512F-ONLY-NEXT: shlq $30, %r15 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: shlq $31, %rbx +; AVX512F-ONLY-NEXT: orq %r15, %rbx +; AVX512F-ONLY-NEXT: orq %r11, %rbx +; AVX512F-ONLY-NEXT: movl %ebx, 8(%rsi) +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r11 +; AVX512F-ONLY-NEXT: shlq $32, %r11 +; AVX512F-ONLY-NEXT: shlq $33, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: kmovw %k5, %r11d +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: movq %r11, %r14 +; AVX512F-ONLY-NEXT: shlq $34, %r14 +; AVX512F-ONLY-NEXT: orq %r10, %r14 +; AVX512F-ONLY-NEXT: movq %r11, %r10 +; AVX512F-ONLY-NEXT: shlq $35, %r10 +; AVX512F-ONLY-NEXT: orq %r14, %r10 +; AVX512F-ONLY-NEXT: movq %r11, %r14 +; AVX512F-ONLY-NEXT: shlq $36, %r14 +; AVX512F-ONLY-NEXT: orq %r10, %r14 +; AVX512F-ONLY-NEXT: movq %r11, %r10 +; AVX512F-ONLY-NEXT: shlq $37, %r10 +; AVX512F-ONLY-NEXT: orq %r14, %r10 +; AVX512F-ONLY-NEXT: movq %r11, %r14 +; AVX512F-ONLY-NEXT: shlq $38, %r14 +; AVX512F-ONLY-NEXT: orq %r10, %r14 +; AVX512F-ONLY-NEXT: movq %r11, %r10 +; AVX512F-ONLY-NEXT: shlq $39, %r10 +; AVX512F-ONLY-NEXT: orq %r14, %r10 +; AVX512F-ONLY-NEXT: shlq $40, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movzbl %r8b, %r10d +; AVX512F-ONLY-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %r14 +; AVX512F-ONLY-NEXT: shlq $41, %r14 +; AVX512F-ONLY-NEXT: orq %r11, %r14 +; AVX512F-ONLY-NEXT: movq %r8, %r11 +; AVX512F-ONLY-NEXT: shlq $42, %r11 +; AVX512F-ONLY-NEXT: orq %r14, %r11 +; AVX512F-ONLY-NEXT: movq %r8, %r14 +; AVX512F-ONLY-NEXT: shlq $43, %r14 +; AVX512F-ONLY-NEXT: orq %r11, %r14 +; AVX512F-ONLY-NEXT: movq %r8, %r11 +; AVX512F-ONLY-NEXT: shlq $44, %r11 +; AVX512F-ONLY-NEXT: orq %r14, %r11 +; AVX512F-ONLY-NEXT: movq %r8, %r14 +; AVX512F-ONLY-NEXT: shlq $45, %r14 +; AVX512F-ONLY-NEXT: orq %r11, %r14 +; AVX512F-ONLY-NEXT: shlq $46, %r8 +; AVX512F-ONLY-NEXT: orq %r14, %r8 +; AVX512F-ONLY-NEXT: shlq $47, %r10 +; AVX512F-ONLY-NEXT: orq %r8, %r10 +; AVX512F-ONLY-NEXT: orq %rbx, %r10 +; AVX512F-ONLY-NEXT: shrq $32, %r10 +; AVX512F-ONLY-NEXT: movw %r10w, 12(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k4, %r8d +; AVX512F-ONLY-NEXT: movzbl %r8b, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: leaq (%r10,%r10,2), %r8 +; AVX512F-ONLY-NEXT: leaq (%r8,%r10,4), %r8 +; AVX512F-ONLY-NEXT: leaq (%r8,%r10,8), %r8 +; AVX512F-ONLY-NEXT: movq %r10, %r11 +; AVX512F-ONLY-NEXT: shlq $4, %r11 +; AVX512F-ONLY-NEXT: orq %r8, %r11 +; AVX512F-ONLY-NEXT: movq %r10, %r8 +; AVX512F-ONLY-NEXT: shlq $5, %r8 +; AVX512F-ONLY-NEXT: orq %r11, %r8 +; AVX512F-ONLY-NEXT: shlq $6, %r10 +; AVX512F-ONLY-NEXT: movzbl %r9b, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $7, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $8, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $9, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $10, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: movq %r9, %r11 +; AVX512F-ONLY-NEXT: shlq $11, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $12, %r10 +; AVX512F-ONLY-NEXT: orq %r11, %r10 +; AVX512F-ONLY-NEXT: shlq $13, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movzbl %dil, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r10 +; AVX512F-ONLY-NEXT: shlq $14, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %rdi, %r9 +; AVX512F-ONLY-NEXT: shlq $15, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %rdi, %r10 +; AVX512F-ONLY-NEXT: shlq $16, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %rdi, %r9 +; AVX512F-ONLY-NEXT: shlq $17, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %rdi, %r10 +; AVX512F-ONLY-NEXT: shlq $18, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %rdi, %r9 +; AVX512F-ONLY-NEXT: shlq $19, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: shlq $20, %rdi +; AVX512F-ONLY-NEXT: orq %r9, %rdi +; AVX512F-ONLY-NEXT: movzbl %dl, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %r9 +; AVX512F-ONLY-NEXT: shlq $21, %r9 +; AVX512F-ONLY-NEXT: orq %rdi, %r9 +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $22, %rdi +; AVX512F-ONLY-NEXT: orq %r9, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %r9 +; AVX512F-ONLY-NEXT: shlq $23, %r9 +; AVX512F-ONLY-NEXT: orq %rdi, %r9 +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $24, %rdi +; AVX512F-ONLY-NEXT: orq %r9, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %r9 +; AVX512F-ONLY-NEXT: shlq $25, %r9 +; AVX512F-ONLY-NEXT: orq %rdi, %r9 +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $26, %rdi +; AVX512F-ONLY-NEXT: orq %r9, %rdi +; AVX512F-ONLY-NEXT: shlq $27, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movzbl %cl, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r9 +; AVX512F-ONLY-NEXT: shlq $28, %r9 +; AVX512F-ONLY-NEXT: orq %rdx, %r9 +; AVX512F-ONLY-NEXT: movq %rdi, %rdx +; AVX512F-ONLY-NEXT: shlq $29, %rdx +; AVX512F-ONLY-NEXT: orq %r9, %rdx +; AVX512F-ONLY-NEXT: movq %rdi, %r9 +; AVX512F-ONLY-NEXT: shlq $30, %r9 +; AVX512F-ONLY-NEXT: orq %rdx, %r9 +; AVX512F-ONLY-NEXT: shlq $31, %rdi +; AVX512F-ONLY-NEXT: orq %r9, %rdi +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $32, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $33, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $34, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: kmovw %k3, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $35, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $36, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $37, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $38, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $39, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $40, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: shlq $41, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: kmovw %k2, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $42, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $43, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $44, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $45, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $46, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $47, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: shlq $48, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: kmovw %k1, %edx +; AVX512F-ONLY-NEXT: andl $1, %edx +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $49, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $50, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $51, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $52, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: movq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $53, %rdi +; AVX512F-ONLY-NEXT: orq %rcx, %rdi +; AVX512F-ONLY-NEXT: movq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $54, %rcx +; AVX512F-ONLY-NEXT: orq %rdi, %rcx +; AVX512F-ONLY-NEXT: shlq $55, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: kmovw %k0, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $56, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $57, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $58, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $59, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rdi +; AVX512F-ONLY-NEXT: shlq $60, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $61, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: shlq $62, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: shlq $63, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rax, (%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor7_vf16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k4 +; AVX512DQ-NEXT: kshiftrw $8, %k4, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k4, %k1 +; AVX512DQ-NEXT: kshiftrw $6, %k4, %k2 +; AVX512DQ-NEXT: kshiftrw $5, %k4, %k3 +; AVX512DQ-NEXT: kshiftrw $4, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %ecx +; AVX512DQ-NEXT: kshiftrw $3, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %edx +; AVX512DQ-NEXT: kshiftrw $2, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %edi +; AVX512DQ-NEXT: kshiftrw $1, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %r9d +; AVX512DQ-NEXT: kshiftrw $15, %k4, %k5 +; AVX512DQ-NEXT: kmovw %k5, %r8d +; AVX512DQ-NEXT: kshiftrw $14, %k4, %k5 +; AVX512DQ-NEXT: kshiftrw $13, %k4, %k6 +; AVX512DQ-NEXT: kmovw %k6, %r10d +; AVX512DQ-NEXT: kshiftrw $12, %k4, %k6 +; AVX512DQ-NEXT: kmovw %k6, %ebx +; AVX512DQ-NEXT: kshiftrw $11, %k4, %k6 +; AVX512DQ-NEXT: kmovw %k6, %ebp +; AVX512DQ-NEXT: kshiftrw $10, %k4, %k6 +; AVX512DQ-NEXT: kmovw %k6, %r14d +; AVX512DQ-NEXT: kshiftrw $9, %k4, %k6 +; AVX512DQ-NEXT: kmovw %k6, %eax +; AVX512DQ-NEXT: movzbl %al, %eax +; AVX512DQ-NEXT: movl %eax, %r11d +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: leaq (%r11,%r11,2), %r15 +; AVX512DQ-NEXT: leaq (%r15,%r11,4), %r15 +; AVX512DQ-NEXT: leaq (%r15,%r11,8), %r15 +; AVX512DQ-NEXT: movq %r11, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: shlq $5, %r11 +; AVX512DQ-NEXT: orq %r12, %r11 +; AVX512DQ-NEXT: movzbl %r14b, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r14 +; AVX512DQ-NEXT: shlq $6, %r14 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $7, %r12 +; AVX512DQ-NEXT: orq %r14, %r12 +; AVX512DQ-NEXT: movq %r15, %r14 +; AVX512DQ-NEXT: shlq $8, %r14 +; AVX512DQ-NEXT: orq %r12, %r14 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $9, %r12 +; AVX512DQ-NEXT: orq %r14, %r12 +; AVX512DQ-NEXT: movq %r15, %r14 +; AVX512DQ-NEXT: shlq $10, %r14 +; AVX512DQ-NEXT: orq %r12, %r14 +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $11, %r12 +; AVX512DQ-NEXT: orq %r14, %r12 +; AVX512DQ-NEXT: shlq $12, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %bpl, %r14d +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $13, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %r14, %r15 +; AVX512DQ-NEXT: shlq $14, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $15, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %r14, %r15 +; AVX512DQ-NEXT: shlq $16, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $17, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %r14, %r15 +; AVX512DQ-NEXT: shlq $18, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: shlq $19, %r14 +; AVX512DQ-NEXT: orq %r15, %r14 +; AVX512DQ-NEXT: movzbl %bl, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rbx +; AVX512DQ-NEXT: shlq $20, %rbx +; AVX512DQ-NEXT: orq %r14, %rbx +; AVX512DQ-NEXT: movq %r15, %r14 +; AVX512DQ-NEXT: shlq $21, %r14 +; AVX512DQ-NEXT: orq %rbx, %r14 +; AVX512DQ-NEXT: movq %r15, %rbx +; AVX512DQ-NEXT: shlq $22, %rbx +; AVX512DQ-NEXT: orq %r14, %rbx +; AVX512DQ-NEXT: movq %r15, %r14 +; AVX512DQ-NEXT: shlq $23, %r14 +; AVX512DQ-NEXT: orq %rbx, %r14 +; AVX512DQ-NEXT: movq %r15, %rbx +; AVX512DQ-NEXT: shlq $24, %rbx +; AVX512DQ-NEXT: orq %r14, %rbx +; AVX512DQ-NEXT: movq %r15, %r14 +; AVX512DQ-NEXT: shlq $25, %r14 +; AVX512DQ-NEXT: orq %rbx, %r14 +; AVX512DQ-NEXT: shlq $26, %r15 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: movzbl %r10b, %ebx +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: movq %rbx, %r14 +; AVX512DQ-NEXT: shlq $27, %r14 +; AVX512DQ-NEXT: orq %r15, %r14 +; AVX512DQ-NEXT: movq %rbx, %r15 +; AVX512DQ-NEXT: shlq $28, %r15 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: movq %rbx, %r14 +; AVX512DQ-NEXT: shlq $29, %r14 +; AVX512DQ-NEXT: orq %r15, %r14 +; AVX512DQ-NEXT: movq %rbx, %r15 +; AVX512DQ-NEXT: shlq $30, %r15 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: shlq $31, %rbx +; AVX512DQ-NEXT: orq %r15, %rbx +; AVX512DQ-NEXT: orq %r11, %rbx +; AVX512DQ-NEXT: movl %ebx, 8(%rsi) +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %r11 +; AVX512DQ-NEXT: shlq $32, %r11 +; AVX512DQ-NEXT: shlq $33, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: kmovw %k5, %r11d +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: movq %r11, %r14 +; AVX512DQ-NEXT: shlq $34, %r14 +; AVX512DQ-NEXT: orq %r10, %r14 +; AVX512DQ-NEXT: movq %r11, %r10 +; AVX512DQ-NEXT: shlq $35, %r10 +; AVX512DQ-NEXT: orq %r14, %r10 +; AVX512DQ-NEXT: movq %r11, %r14 +; AVX512DQ-NEXT: shlq $36, %r14 +; AVX512DQ-NEXT: orq %r10, %r14 +; AVX512DQ-NEXT: movq %r11, %r10 +; AVX512DQ-NEXT: shlq $37, %r10 +; AVX512DQ-NEXT: orq %r14, %r10 +; AVX512DQ-NEXT: movq %r11, %r14 +; AVX512DQ-NEXT: shlq $38, %r14 +; AVX512DQ-NEXT: orq %r10, %r14 +; AVX512DQ-NEXT: movq %r11, %r10 +; AVX512DQ-NEXT: shlq $39, %r10 +; AVX512DQ-NEXT: orq %r14, %r10 +; AVX512DQ-NEXT: shlq $40, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movzbl %r8b, %r10d +; AVX512DQ-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %r14 +; AVX512DQ-NEXT: shlq $41, %r14 +; AVX512DQ-NEXT: orq %r11, %r14 +; AVX512DQ-NEXT: movq %r8, %r11 +; AVX512DQ-NEXT: shlq $42, %r11 +; AVX512DQ-NEXT: orq %r14, %r11 +; AVX512DQ-NEXT: movq %r8, %r14 +; AVX512DQ-NEXT: shlq $43, %r14 +; AVX512DQ-NEXT: orq %r11, %r14 +; AVX512DQ-NEXT: movq %r8, %r11 +; AVX512DQ-NEXT: shlq $44, %r11 +; AVX512DQ-NEXT: orq %r14, %r11 +; AVX512DQ-NEXT: movq %r8, %r14 +; AVX512DQ-NEXT: shlq $45, %r14 +; AVX512DQ-NEXT: orq %r11, %r14 +; AVX512DQ-NEXT: shlq $46, %r8 +; AVX512DQ-NEXT: orq %r14, %r8 +; AVX512DQ-NEXT: shlq $47, %r10 +; AVX512DQ-NEXT: orq %r8, %r10 +; AVX512DQ-NEXT: orq %rbx, %r10 +; AVX512DQ-NEXT: shrq $32, %r10 +; AVX512DQ-NEXT: movw %r10w, 12(%rsi) +; AVX512DQ-NEXT: kmovw %k4, %r8d +; AVX512DQ-NEXT: movzbl %r8b, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: leaq (%r10,%r10,2), %r8 +; AVX512DQ-NEXT: leaq (%r8,%r10,4), %r8 +; AVX512DQ-NEXT: leaq (%r8,%r10,8), %r8 +; AVX512DQ-NEXT: movq %r10, %r11 +; AVX512DQ-NEXT: shlq $4, %r11 +; AVX512DQ-NEXT: orq %r8, %r11 +; AVX512DQ-NEXT: movq %r10, %r8 +; AVX512DQ-NEXT: shlq $5, %r8 +; AVX512DQ-NEXT: orq %r11, %r8 +; AVX512DQ-NEXT: shlq $6, %r10 +; AVX512DQ-NEXT: movzbl %r9b, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $7, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $8, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $9, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $10, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: movq %r9, %r11 +; AVX512DQ-NEXT: shlq $11, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r9, %r10 +; AVX512DQ-NEXT: shlq $12, %r10 +; AVX512DQ-NEXT: orq %r11, %r10 +; AVX512DQ-NEXT: shlq $13, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movzbl %dil, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r10 +; AVX512DQ-NEXT: shlq $14, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %rdi, %r9 +; AVX512DQ-NEXT: shlq $15, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %rdi, %r10 +; AVX512DQ-NEXT: shlq $16, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %rdi, %r9 +; AVX512DQ-NEXT: shlq $17, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %rdi, %r10 +; AVX512DQ-NEXT: shlq $18, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %rdi, %r9 +; AVX512DQ-NEXT: shlq $19, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: shlq $20, %rdi +; AVX512DQ-NEXT: orq %r9, %rdi +; AVX512DQ-NEXT: movzbl %dl, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %r9 +; AVX512DQ-NEXT: shlq $21, %r9 +; AVX512DQ-NEXT: orq %rdi, %r9 +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $22, %rdi +; AVX512DQ-NEXT: orq %r9, %rdi +; AVX512DQ-NEXT: movq %rdx, %r9 +; AVX512DQ-NEXT: shlq $23, %r9 +; AVX512DQ-NEXT: orq %rdi, %r9 +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $24, %rdi +; AVX512DQ-NEXT: orq %r9, %rdi +; AVX512DQ-NEXT: movq %rdx, %r9 +; AVX512DQ-NEXT: shlq $25, %r9 +; AVX512DQ-NEXT: orq %rdi, %r9 +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $26, %rdi +; AVX512DQ-NEXT: orq %r9, %rdi +; AVX512DQ-NEXT: shlq $27, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movzbl %cl, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r9 +; AVX512DQ-NEXT: shlq $28, %r9 +; AVX512DQ-NEXT: orq %rdx, %r9 +; AVX512DQ-NEXT: movq %rdi, %rdx +; AVX512DQ-NEXT: shlq $29, %rdx +; AVX512DQ-NEXT: orq %r9, %rdx +; AVX512DQ-NEXT: movq %rdi, %r9 +; AVX512DQ-NEXT: shlq $30, %r9 +; AVX512DQ-NEXT: orq %rdx, %r9 +; AVX512DQ-NEXT: shlq $31, %rdi +; AVX512DQ-NEXT: orq %r9, %rdi +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $32, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $33, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: shlq $34, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: kmovw %k3, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $35, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: movq %rdx, %rcx +; AVX512DQ-NEXT: shlq $36, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $37, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: movq %rdx, %rcx +; AVX512DQ-NEXT: shlq $38, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $39, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: movq %rdx, %rcx +; AVX512DQ-NEXT: shlq $40, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: shlq $41, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: kmovw %k2, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $42, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $43, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $44, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $45, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $46, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $47, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: shlq $48, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: andl $1, %edx +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $49, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: movq %rdx, %rcx +; AVX512DQ-NEXT: shlq $50, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $51, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: movq %rdx, %rcx +; AVX512DQ-NEXT: shlq $52, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: movq %rdx, %rdi +; AVX512DQ-NEXT: shlq $53, %rdi +; AVX512DQ-NEXT: orq %rcx, %rdi +; AVX512DQ-NEXT: movq %rdx, %rcx +; AVX512DQ-NEXT: shlq $54, %rcx +; AVX512DQ-NEXT: orq %rdi, %rcx +; AVX512DQ-NEXT: shlq $55, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $56, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $57, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $58, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $59, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rcx, %rdi +; AVX512DQ-NEXT: shlq $60, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $61, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: shlq $62, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: shlq $63, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rax, (%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor7_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovw (%rdi), %k4 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k4, %k3 +; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %ecx +; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %edx +; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %edi +; AVX512BW-NEXT: kshiftrw $1, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %r9d +; AVX512BW-NEXT: kshiftrw $15, %k4, %k5 +; AVX512BW-NEXT: kmovd %k5, %r8d +; AVX512BW-NEXT: kshiftrw $14, %k4, %k5 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 +; AVX512BW-NEXT: kmovd %k6, %r10d +; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 +; AVX512BW-NEXT: kmovd %k6, %ebx +; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 +; AVX512BW-NEXT: kmovd %k6, %ebp +; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 +; AVX512BW-NEXT: kmovd %k6, %r14d +; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 +; AVX512BW-NEXT: kmovd %k6, %eax +; AVX512BW-NEXT: movzbl %al, %eax +; AVX512BW-NEXT: movl %eax, %r11d +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: leaq (%r11,%r11,2), %r15 +; AVX512BW-NEXT: leaq (%r15,%r11,4), %r15 +; AVX512BW-NEXT: leaq (%r15,%r11,8), %r15 +; AVX512BW-NEXT: movq %r11, %r12 +; AVX512BW-NEXT: shlq $4, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: shlq $5, %r11 +; AVX512BW-NEXT: orq %r12, %r11 +; AVX512BW-NEXT: movzbl %r14b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r14 +; AVX512BW-NEXT: shlq $6, %r14 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: movq %r15, %r14 +; AVX512BW-NEXT: shlq $8, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: movq %r15, %r14 +; AVX512BW-NEXT: shlq $10, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $11, %r12 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: shlq $12, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %bpl, %r14d +; AVX512BW-NEXT: andl $1, %r14d +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $13, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movq %r14, %r15 +; AVX512BW-NEXT: shlq $14, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $15, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movq %r14, %r15 +; AVX512BW-NEXT: shlq $16, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movq %r14, %r12 +; AVX512BW-NEXT: shlq $17, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movq %r14, %r15 +; AVX512BW-NEXT: shlq $18, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: shlq $19, %r14 +; AVX512BW-NEXT: orq %r15, %r14 +; AVX512BW-NEXT: movzbl %bl, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rbx +; AVX512BW-NEXT: shlq $20, %rbx +; AVX512BW-NEXT: orq %r14, %rbx +; AVX512BW-NEXT: movq %r15, %r14 +; AVX512BW-NEXT: shlq $21, %r14 +; AVX512BW-NEXT: orq %rbx, %r14 +; AVX512BW-NEXT: movq %r15, %rbx +; AVX512BW-NEXT: shlq $22, %rbx +; AVX512BW-NEXT: orq %r14, %rbx +; AVX512BW-NEXT: movq %r15, %r14 +; AVX512BW-NEXT: shlq $23, %r14 +; AVX512BW-NEXT: orq %rbx, %r14 +; AVX512BW-NEXT: movq %r15, %rbx +; AVX512BW-NEXT: shlq $24, %rbx +; AVX512BW-NEXT: orq %r14, %rbx +; AVX512BW-NEXT: movq %r15, %r14 +; AVX512BW-NEXT: shlq $25, %r14 +; AVX512BW-NEXT: orq %rbx, %r14 +; AVX512BW-NEXT: shlq $26, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: movzbl %r10b, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %r14 +; AVX512BW-NEXT: shlq $27, %r14 +; AVX512BW-NEXT: orq %r15, %r14 +; AVX512BW-NEXT: movq %rbx, %r15 +; AVX512BW-NEXT: shlq $28, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: movq %rbx, %r14 +; AVX512BW-NEXT: shlq $29, %r14 +; AVX512BW-NEXT: orq %r15, %r14 +; AVX512BW-NEXT: movq %rbx, %r15 +; AVX512BW-NEXT: shlq $30, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: shlq $31, %rbx +; AVX512BW-NEXT: orq %r15, %rbx +; AVX512BW-NEXT: orq %r11, %rbx +; AVX512BW-NEXT: movl %ebx, 8(%rsi) +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %r11 +; AVX512BW-NEXT: shlq $32, %r11 +; AVX512BW-NEXT: shlq $33, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: kmovd %k5, %r11d +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %r14 +; AVX512BW-NEXT: shlq $34, %r14 +; AVX512BW-NEXT: orq %r10, %r14 +; AVX512BW-NEXT: movq %r11, %r10 +; AVX512BW-NEXT: shlq $35, %r10 +; AVX512BW-NEXT: orq %r14, %r10 +; AVX512BW-NEXT: movq %r11, %r14 +; AVX512BW-NEXT: shlq $36, %r14 +; AVX512BW-NEXT: orq %r10, %r14 +; AVX512BW-NEXT: movq %r11, %r10 +; AVX512BW-NEXT: shlq $37, %r10 +; AVX512BW-NEXT: orq %r14, %r10 +; AVX512BW-NEXT: movq %r11, %r14 +; AVX512BW-NEXT: shlq $38, %r14 +; AVX512BW-NEXT: orq %r10, %r14 +; AVX512BW-NEXT: movq %r11, %r10 +; AVX512BW-NEXT: shlq $39, %r10 +; AVX512BW-NEXT: orq %r14, %r10 +; AVX512BW-NEXT: shlq $40, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movzbl %r8b, %r10d +; AVX512BW-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %r14 +; AVX512BW-NEXT: shlq $41, %r14 +; AVX512BW-NEXT: orq %r11, %r14 +; AVX512BW-NEXT: movq %r8, %r11 +; AVX512BW-NEXT: shlq $42, %r11 +; AVX512BW-NEXT: orq %r14, %r11 +; AVX512BW-NEXT: movq %r8, %r14 +; AVX512BW-NEXT: shlq $43, %r14 +; AVX512BW-NEXT: orq %r11, %r14 +; AVX512BW-NEXT: movq %r8, %r11 +; AVX512BW-NEXT: shlq $44, %r11 +; AVX512BW-NEXT: orq %r14, %r11 +; AVX512BW-NEXT: movq %r8, %r14 +; AVX512BW-NEXT: shlq $45, %r14 +; AVX512BW-NEXT: orq %r11, %r14 +; AVX512BW-NEXT: shlq $46, %r8 +; AVX512BW-NEXT: orq %r14, %r8 +; AVX512BW-NEXT: shlq $47, %r10 +; AVX512BW-NEXT: orq %r8, %r10 +; AVX512BW-NEXT: orq %rbx, %r10 +; AVX512BW-NEXT: shrq $32, %r10 +; AVX512BW-NEXT: movw %r10w, 12(%rsi) +; AVX512BW-NEXT: kmovd %k4, %r8d +; AVX512BW-NEXT: movzbl %r8b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: leaq (%r10,%r10,2), %r8 +; AVX512BW-NEXT: leaq (%r8,%r10,4), %r8 +; AVX512BW-NEXT: leaq (%r8,%r10,8), %r8 +; AVX512BW-NEXT: movq %r10, %r11 +; AVX512BW-NEXT: shlq $4, %r11 +; AVX512BW-NEXT: orq %r8, %r11 +; AVX512BW-NEXT: movq %r10, %r8 +; AVX512BW-NEXT: shlq $5, %r8 +; AVX512BW-NEXT: orq %r11, %r8 +; AVX512BW-NEXT: shlq $6, %r10 +; AVX512BW-NEXT: movzbl %r9b, %r9d +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $7, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $8, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $9, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $10, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: movq %r9, %r11 +; AVX512BW-NEXT: shlq $11, %r11 +; AVX512BW-NEXT: orq %r10, %r11 +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: shlq $12, %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: shlq $13, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movzbl %dil, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r10 +; AVX512BW-NEXT: shlq $14, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %rdi, %r9 +; AVX512BW-NEXT: shlq $15, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movq %rdi, %r10 +; AVX512BW-NEXT: shlq $16, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %rdi, %r9 +; AVX512BW-NEXT: shlq $17, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: movq %rdi, %r10 +; AVX512BW-NEXT: shlq $18, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: movq %rdi, %r9 +; AVX512BW-NEXT: shlq $19, %r9 +; AVX512BW-NEXT: orq %r10, %r9 +; AVX512BW-NEXT: shlq $20, %rdi +; AVX512BW-NEXT: orq %r9, %rdi +; AVX512BW-NEXT: movzbl %dl, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %r9 +; AVX512BW-NEXT: shlq $21, %r9 +; AVX512BW-NEXT: orq %rdi, %r9 +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $22, %rdi +; AVX512BW-NEXT: orq %r9, %rdi +; AVX512BW-NEXT: movq %rdx, %r9 +; AVX512BW-NEXT: shlq $23, %r9 +; AVX512BW-NEXT: orq %rdi, %r9 +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $24, %rdi +; AVX512BW-NEXT: orq %r9, %rdi +; AVX512BW-NEXT: movq %rdx, %r9 +; AVX512BW-NEXT: shlq $25, %r9 +; AVX512BW-NEXT: orq %rdi, %r9 +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $26, %rdi +; AVX512BW-NEXT: orq %r9, %rdi +; AVX512BW-NEXT: shlq $27, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movzbl %cl, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r9 +; AVX512BW-NEXT: shlq $28, %r9 +; AVX512BW-NEXT: orq %rdx, %r9 +; AVX512BW-NEXT: movq %rdi, %rdx +; AVX512BW-NEXT: shlq $29, %rdx +; AVX512BW-NEXT: orq %r9, %rdx +; AVX512BW-NEXT: movq %rdi, %r9 +; AVX512BW-NEXT: shlq $30, %r9 +; AVX512BW-NEXT: orq %rdx, %r9 +; AVX512BW-NEXT: shlq $31, %rdi +; AVX512BW-NEXT: orq %r9, %rdi +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $32, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $33, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: shlq $34, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: kmovd %k3, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $35, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: shlq $36, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $37, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: shlq $38, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $39, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: shlq $40, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: shlq $41, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: kmovd %k2, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $42, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $43, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $44, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $45, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $46, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $47, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: shlq $48, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $49, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: shlq $50, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $51, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: shlq $52, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $53, %rdi +; AVX512BW-NEXT: orq %rcx, %rdi +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: shlq $54, %rcx +; AVX512BW-NEXT: orq %rdi, %rcx +; AVX512BW-NEXT: shlq $55, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $56, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $57, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $58, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $59, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rcx, %rdi +; AVX512BW-NEXT: shlq $60, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $61, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: shlq $62, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: shlq $63, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rax, (%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <16 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <112 x i32> + store <112 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor7_vf32(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor7_vf32: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r13 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k0 +; AVX512F-ONLY-NEXT: kshiftrw $15, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-ONLY-NEXT: kshiftrw $13, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $12, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %edx +; AVX512F-ONLY-NEXT: kshiftrw $11, %k1, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %edi +; AVX512F-ONLY-NEXT: kshiftrw $11, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k0, %k2 +; AVX512F-ONLY-NEXT: kmovw %k2, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k0, %k2 +; AVX512F-ONLY-NEXT: movl %r8d, %r9d +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: leal (%r9,%r9,2), %eax +; AVX512F-ONLY-NEXT: leal (%rax,%r9,4), %eax +; AVX512F-ONLY-NEXT: leal (%rax,%r9,8), %eax +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movl %r15d, %r12d +; AVX512F-ONLY-NEXT: shll $4, %r12d +; AVX512F-ONLY-NEXT: orl %eax, %r12d +; AVX512F-ONLY-NEXT: movl %r15d, %ebx +; AVX512F-ONLY-NEXT: shll $5, %ebx +; AVX512F-ONLY-NEXT: orl %r12d, %ebx +; AVX512F-ONLY-NEXT: movl %r15d, %eax +; AVX512F-ONLY-NEXT: shll $6, %eax +; AVX512F-ONLY-NEXT: movl %r15d, %r12d +; AVX512F-ONLY-NEXT: shll $7, %r12d +; AVX512F-ONLY-NEXT: orl %eax, %r12d +; AVX512F-ONLY-NEXT: movl %r15d, %eax +; AVX512F-ONLY-NEXT: shll $8, %eax +; AVX512F-ONLY-NEXT: orl %r12d, %eax +; AVX512F-ONLY-NEXT: movl %r15d, %r13d +; AVX512F-ONLY-NEXT: shll $9, %r13d +; AVX512F-ONLY-NEXT: orl %eax, %r13d +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $15, %k0, %k2 +; AVX512F-ONLY-NEXT: shll $10, %r15d +; AVX512F-ONLY-NEXT: orl %r13d, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movl %r10d, %eax +; AVX512F-ONLY-NEXT: shll $11, %eax +; AVX512F-ONLY-NEXT: orl %r15d, %eax +; AVX512F-ONLY-NEXT: movl %r10d, %r15d +; AVX512F-ONLY-NEXT: shll $12, %r15d +; AVX512F-ONLY-NEXT: orl %eax, %r15d +; AVX512F-ONLY-NEXT: movl %r10d, %eax +; AVX512F-ONLY-NEXT: shll $13, %eax +; AVX512F-ONLY-NEXT: orl %r15d, %eax +; AVX512F-ONLY-NEXT: movl %r10d, %r15d +; AVX512F-ONLY-NEXT: shll $14, %r15d +; AVX512F-ONLY-NEXT: orl %eax, %r15d +; AVX512F-ONLY-NEXT: movl %r10d, %eax +; AVX512F-ONLY-NEXT: shll $15, %eax +; AVX512F-ONLY-NEXT: orl %r15d, %eax +; AVX512F-ONLY-NEXT: movl %r10d, %r15d +; AVX512F-ONLY-NEXT: shll $16, %r15d +; AVX512F-ONLY-NEXT: orl %eax, %r15d +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $4, %k1, %k2 +; AVX512F-ONLY-NEXT: shll $17, %r10d +; AVX512F-ONLY-NEXT: orl %r15d, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movl %r12d, %r15d +; AVX512F-ONLY-NEXT: shll $18, %r15d +; AVX512F-ONLY-NEXT: orl %r10d, %r15d +; AVX512F-ONLY-NEXT: movl %r12d, %r10d +; AVX512F-ONLY-NEXT: shll $19, %r10d +; AVX512F-ONLY-NEXT: orl %r15d, %r10d +; AVX512F-ONLY-NEXT: movl %r12d, %r15d +; AVX512F-ONLY-NEXT: shll $20, %r15d +; AVX512F-ONLY-NEXT: orl %r10d, %r15d +; AVX512F-ONLY-NEXT: movl %r12d, %r10d +; AVX512F-ONLY-NEXT: shll $21, %r10d +; AVX512F-ONLY-NEXT: orl %r15d, %r10d +; AVX512F-ONLY-NEXT: movl %r12d, %r15d +; AVX512F-ONLY-NEXT: shll $22, %r15d +; AVX512F-ONLY-NEXT: orl %r10d, %r15d +; AVX512F-ONLY-NEXT: movl %r12d, %r10d +; AVX512F-ONLY-NEXT: shll $23, %r10d +; AVX512F-ONLY-NEXT: orl %r15d, %r10d +; AVX512F-ONLY-NEXT: shll $24, %r12d +; AVX512F-ONLY-NEXT: orl %r10d, %r12d +; AVX512F-ONLY-NEXT: movl %eax, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movl %r15d, %r10d +; AVX512F-ONLY-NEXT: shll $25, %r10d +; AVX512F-ONLY-NEXT: orl %r12d, %r10d +; AVX512F-ONLY-NEXT: movl %r15d, %r12d +; AVX512F-ONLY-NEXT: shll $26, %r12d +; AVX512F-ONLY-NEXT: orl %r10d, %r12d +; AVX512F-ONLY-NEXT: movl %r15d, %r10d +; AVX512F-ONLY-NEXT: shll $27, %r10d +; AVX512F-ONLY-NEXT: orl %r12d, %r10d +; AVX512F-ONLY-NEXT: movl %r15d, %r12d +; AVX512F-ONLY-NEXT: shll $28, %r12d +; AVX512F-ONLY-NEXT: orl %r10d, %r12d +; AVX512F-ONLY-NEXT: movl %r15d, %r13d +; AVX512F-ONLY-NEXT: shll $29, %r13d +; AVX512F-ONLY-NEXT: orl %r12d, %r13d +; AVX512F-ONLY-NEXT: kmovw %k2, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k2 +; AVX512F-ONLY-NEXT: shll $30, %r15d +; AVX512F-ONLY-NEXT: orl %r13d, %r15d +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k1, %k2 +; AVX512F-ONLY-NEXT: shll $31, %eax +; AVX512F-ONLY-NEXT: orl %r15d, %eax +; AVX512F-ONLY-NEXT: orl %ebx, %eax +; AVX512F-ONLY-NEXT: movl %eax, 24(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k1, %eax +; AVX512F-ONLY-NEXT: movzbl %al, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %rbx +; AVX512F-ONLY-NEXT: leaq (%rbx,%rax,4), %rbx +; AVX512F-ONLY-NEXT: leaq (%rbx,%rax,8), %rbx +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $4, %r15 +; AVX512F-ONLY-NEXT: orq %rbx, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $5, %rbx +; AVX512F-ONLY-NEXT: orq %r15, %rbx +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movzbl %r12b, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $9, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $11, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $3, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $13, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movzbl %r12b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $15, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $17, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $19, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $9, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $20, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $21, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $22, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $24, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $25, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $26, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $27, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %r10b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $28, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $29, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $30, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $32, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $34, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $36, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $37, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $38, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $39, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $40, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $43, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $45, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r13 +; AVX512F-ONLY-NEXT: shlq $47, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $48, %r10 +; AVX512F-ONLY-NEXT: orq %r13, %r10 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $49, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $50, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $51, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $52, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $53, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $54, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k2 +; AVX512F-ONLY-NEXT: shlq $55, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $56, %r10 +; AVX512F-ONLY-NEXT: orq %r12, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $57, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $58, %r10 +; AVX512F-ONLY-NEXT: orq %r12, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $59, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $60, %r10 +; AVX512F-ONLY-NEXT: orq %r12, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $62, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %r15b, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r15 +; AVX512F-ONLY-NEXT: shlq $63, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: orq %rbx, %r15 +; AVX512F-ONLY-NEXT: movq %r15, (%rsi) +; AVX512F-ONLY-NEXT: movzbl %r11b, %r11d +; AVX512F-ONLY-NEXT: movl %r11d, %ebx +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: leaq (%rbx,%rbx,2), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%rbx,4), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%rbx,8), %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $5, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $9, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k2 +; AVX512F-ONLY-NEXT: shlq $11, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $13, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $15, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $17, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %eax +; AVX512F-ONLY-NEXT: kshiftrw $10, %k1, %k2 +; AVX512F-ONLY-NEXT: shlq $18, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $19, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $20, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $21, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $22, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $24, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $25, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %r14b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $26, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $27, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $28, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $29, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $30, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k2, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k2 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k0, %k2 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: shlq $32, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $34, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $36, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $37, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $38, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k0, %k2 +; AVX512F-ONLY-NEXT: shlq $39, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $43, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r13 +; AVX512F-ONLY-NEXT: shlq $45, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k0, %k2 +; AVX512F-ONLY-NEXT: shlq $46, %r14 +; AVX512F-ONLY-NEXT: orq %r13, %r14 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $48, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $49, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $50, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $51, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $52, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-ONLY-NEXT: shlq $53, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $55, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $57, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $58, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r14, %r12 +; AVX512F-ONLY-NEXT: shlq $59, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k1, %r13d +; AVX512F-ONLY-NEXT: shlq $60, %r14 +; AVX512F-ONLY-NEXT: orq %r12, %r14 +; AVX512F-ONLY-NEXT: movq %r9, %r12 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %r14, %r12 +; AVX512F-ONLY-NEXT: kmovw %k0, %r14d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k0 +; AVX512F-ONLY-NEXT: shlq $62, %r9 +; AVX512F-ONLY-NEXT: orq %r12, %r9 +; AVX512F-ONLY-NEXT: kmovw %k0, %r12d +; AVX512F-ONLY-NEXT: movzbl %r8b, %eax +; AVX512F-ONLY-NEXT: shlq $63, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %rax, 16(%rsi) +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: leaq (%r10,%r10,2), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%r10,4), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%r10,8), %rax +; AVX512F-ONLY-NEXT: movq %r10, %r8 +; AVX512F-ONLY-NEXT: shlq $4, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $5, %r10 +; AVX512F-ONLY-NEXT: orq %r8, %r10 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $6, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $7, %r9 +; AVX512F-ONLY-NEXT: orq %r8, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $8, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $9, %r9 +; AVX512F-ONLY-NEXT: orq %r8, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $10, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $11, %r9 +; AVX512F-ONLY-NEXT: orq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movzbl %dil, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $13, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $15, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %r8 +; AVX512F-ONLY-NEXT: shlq $17, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $19, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: movzbl %dl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $20, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $21, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $22, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $23, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $24, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $25, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $26, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movzbl %cl, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $27, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $28, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $29, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $30, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $31, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $35, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $37, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $39, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $40, %r13 +; AVX512F-ONLY-NEXT: orq %rcx, %r13 +; AVX512F-ONLY-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $41, %rcx +; AVX512F-ONLY-NEXT: orq %r13, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $42, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $43, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $44, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $45, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $46, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: andl $1, %r14d +; AVX512F-ONLY-NEXT: movq %r14, %rcx +; AVX512F-ONLY-NEXT: shlq $48, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $49, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rcx +; AVX512F-ONLY-NEXT: shlq $50, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $51, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r14, %rcx +; AVX512F-ONLY-NEXT: shlq $52, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r14, %rax +; AVX512F-ONLY-NEXT: shlq $53, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $54, %r14 +; AVX512F-ONLY-NEXT: orq %rax, %r14 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %r14, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $56, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $58, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $59, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $60, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %rcx, %r12 +; AVX512F-ONLY-NEXT: shlq $62, %rbx +; AVX512F-ONLY-NEXT: orq %r12, %rbx +; AVX512F-ONLY-NEXT: shlq $63, %r11 +; AVX512F-ONLY-NEXT: orq %rbx, %r11 +; AVX512F-ONLY-NEXT: orq %r10, %r11 +; AVX512F-ONLY-NEXT: movq %r11, 8(%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r13 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor7_vf32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k1 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %ecx +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %edx +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k2 +; AVX512DQ-NEXT: kmovw %k2, %edi +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r8d +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r11d +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, %r10d +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQ-NEXT: movl %r8d, %r9d +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: leal (%r9,%r9,2), %eax +; AVX512DQ-NEXT: leal (%rax,%r9,4), %eax +; AVX512DQ-NEXT: leal (%rax,%r9,8), %eax +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movl %r15d, %r12d +; AVX512DQ-NEXT: shll $4, %r12d +; AVX512DQ-NEXT: orl %eax, %r12d +; AVX512DQ-NEXT: movl %r15d, %ebx +; AVX512DQ-NEXT: shll $5, %ebx +; AVX512DQ-NEXT: orl %r12d, %ebx +; AVX512DQ-NEXT: movl %r15d, %eax +; AVX512DQ-NEXT: shll $6, %eax +; AVX512DQ-NEXT: movl %r15d, %r12d +; AVX512DQ-NEXT: shll $7, %r12d +; AVX512DQ-NEXT: orl %eax, %r12d +; AVX512DQ-NEXT: movl %r15d, %eax +; AVX512DQ-NEXT: shll $8, %eax +; AVX512DQ-NEXT: orl %r12d, %eax +; AVX512DQ-NEXT: movl %r15d, %r13d +; AVX512DQ-NEXT: shll $9, %r13d +; AVX512DQ-NEXT: orl %eax, %r13d +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k2 +; AVX512DQ-NEXT: shll $10, %r15d +; AVX512DQ-NEXT: orl %r13d, %r15d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movl %r10d, %eax +; AVX512DQ-NEXT: shll $11, %eax +; AVX512DQ-NEXT: orl %r15d, %eax +; AVX512DQ-NEXT: movl %r10d, %r15d +; AVX512DQ-NEXT: shll $12, %r15d +; AVX512DQ-NEXT: orl %eax, %r15d +; AVX512DQ-NEXT: movl %r10d, %eax +; AVX512DQ-NEXT: shll $13, %eax +; AVX512DQ-NEXT: orl %r15d, %eax +; AVX512DQ-NEXT: movl %r10d, %r15d +; AVX512DQ-NEXT: shll $14, %r15d +; AVX512DQ-NEXT: orl %eax, %r15d +; AVX512DQ-NEXT: movl %r10d, %eax +; AVX512DQ-NEXT: shll $15, %eax +; AVX512DQ-NEXT: orl %r15d, %eax +; AVX512DQ-NEXT: movl %r10d, %r15d +; AVX512DQ-NEXT: shll $16, %r15d +; AVX512DQ-NEXT: orl %eax, %r15d +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k2 +; AVX512DQ-NEXT: shll $17, %r10d +; AVX512DQ-NEXT: orl %r15d, %r10d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movl %r12d, %r15d +; AVX512DQ-NEXT: shll $18, %r15d +; AVX512DQ-NEXT: orl %r10d, %r15d +; AVX512DQ-NEXT: movl %r12d, %r10d +; AVX512DQ-NEXT: shll $19, %r10d +; AVX512DQ-NEXT: orl %r15d, %r10d +; AVX512DQ-NEXT: movl %r12d, %r15d +; AVX512DQ-NEXT: shll $20, %r15d +; AVX512DQ-NEXT: orl %r10d, %r15d +; AVX512DQ-NEXT: movl %r12d, %r10d +; AVX512DQ-NEXT: shll $21, %r10d +; AVX512DQ-NEXT: orl %r15d, %r10d +; AVX512DQ-NEXT: movl %r12d, %r15d +; AVX512DQ-NEXT: shll $22, %r15d +; AVX512DQ-NEXT: orl %r10d, %r15d +; AVX512DQ-NEXT: movl %r12d, %r10d +; AVX512DQ-NEXT: shll $23, %r10d +; AVX512DQ-NEXT: orl %r15d, %r10d +; AVX512DQ-NEXT: shll $24, %r12d +; AVX512DQ-NEXT: orl %r10d, %r12d +; AVX512DQ-NEXT: movl %eax, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movl %r15d, %r10d +; AVX512DQ-NEXT: shll $25, %r10d +; AVX512DQ-NEXT: orl %r12d, %r10d +; AVX512DQ-NEXT: movl %r15d, %r12d +; AVX512DQ-NEXT: shll $26, %r12d +; AVX512DQ-NEXT: orl %r10d, %r12d +; AVX512DQ-NEXT: movl %r15d, %r10d +; AVX512DQ-NEXT: shll $27, %r10d +; AVX512DQ-NEXT: orl %r12d, %r10d +; AVX512DQ-NEXT: movl %r15d, %r12d +; AVX512DQ-NEXT: shll $28, %r12d +; AVX512DQ-NEXT: orl %r10d, %r12d +; AVX512DQ-NEXT: movl %r15d, %r13d +; AVX512DQ-NEXT: shll $29, %r13d +; AVX512DQ-NEXT: orl %r12d, %r13d +; AVX512DQ-NEXT: kmovw %k2, %r10d +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k2 +; AVX512DQ-NEXT: shll $30, %r15d +; AVX512DQ-NEXT: orl %r13d, %r15d +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k2 +; AVX512DQ-NEXT: shll $31, %eax +; AVX512DQ-NEXT: orl %r15d, %eax +; AVX512DQ-NEXT: orl %ebx, %eax +; AVX512DQ-NEXT: movl %eax, 24(%rsi) +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: movzbl %al, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %rbx +; AVX512DQ-NEXT: leaq (%rbx,%rax,4), %rbx +; AVX512DQ-NEXT: leaq (%rbx,%rax,8), %rbx +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $4, %r15 +; AVX512DQ-NEXT: orq %rbx, %r15 +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $5, %rbx +; AVX512DQ-NEXT: orq %r15, %rbx +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movzbl %r12b, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $7, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $9, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $11, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k2 +; AVX512DQ-NEXT: shlq $13, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movzbl %r12b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $15, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $17, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $19, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQ-NEXT: shlq $20, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %al, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $21, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $22, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $24, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $25, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $26, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $27, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %r10b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $28, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $29, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $30, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k2 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $32, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k2 +; AVX512DQ-NEXT: shlq $34, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $36, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $37, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $38, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $39, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $40, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r10d +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k2 +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $43, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $45, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r13 +; AVX512DQ-NEXT: shlq $47, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 +; AVX512DQ-NEXT: shlq $48, %r10 +; AVX512DQ-NEXT: orq %r13, %r10 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $49, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $50, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $51, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $52, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $53, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $54, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k2 +; AVX512DQ-NEXT: shlq $55, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $56, %r10 +; AVX512DQ-NEXT: orq %r12, %r10 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $57, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $58, %r10 +; AVX512DQ-NEXT: orq %r12, %r10 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $59, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $60, %r10 +; AVX512DQ-NEXT: orq %r12, %r10 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: shlq $62, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %r15b, %r10d +; AVX512DQ-NEXT: movq %r10, %r15 +; AVX512DQ-NEXT: shlq $63, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: orq %rbx, %r15 +; AVX512DQ-NEXT: movq %r15, (%rsi) +; AVX512DQ-NEXT: movzbl %r11b, %r11d +; AVX512DQ-NEXT: movl %r11d, %ebx +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: leaq (%rbx,%rbx,2), %rax +; AVX512DQ-NEXT: leaq (%rax,%rbx,4), %rax +; AVX512DQ-NEXT: leaq (%rax,%rbx,8), %rax +; AVX512DQ-NEXT: movq %rbx, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $5, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $7, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $9, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k2 +; AVX512DQ-NEXT: shlq $11, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $13, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $15, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $17, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %eax +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k2 +; AVX512DQ-NEXT: shlq $18, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $19, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $20, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $21, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $22, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $24, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $25, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %r14b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $26, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $27, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $28, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $29, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $30, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k2, %ebp +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: shlq $32, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $34, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $36, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $37, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $38, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQ-NEXT: shlq $39, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $43, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %r13 +; AVX512DQ-NEXT: shlq $45, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQ-NEXT: shlq $46, %r14 +; AVX512DQ-NEXT: orq %r13, %r14 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $48, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $49, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $50, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $51, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $52, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r14d +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k1 +; AVX512DQ-NEXT: shlq $53, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $55, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $57, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $58, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r14, %r12 +; AVX512DQ-NEXT: shlq $59, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k1, %r13d +; AVX512DQ-NEXT: shlq $60, %r14 +; AVX512DQ-NEXT: orq %r12, %r14 +; AVX512DQ-NEXT: movq %r9, %r12 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %r14, %r12 +; AVX512DQ-NEXT: kmovw %k0, %r14d +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512DQ-NEXT: shlq $62, %r9 +; AVX512DQ-NEXT: orq %r12, %r9 +; AVX512DQ-NEXT: kmovw %k0, %r12d +; AVX512DQ-NEXT: movzbl %r8b, %eax +; AVX512DQ-NEXT: shlq $63, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %rax, 16(%rsi) +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: leaq (%r10,%r10,2), %rax +; AVX512DQ-NEXT: leaq (%rax,%r10,4), %rax +; AVX512DQ-NEXT: leaq (%rax,%r10,8), %rax +; AVX512DQ-NEXT: movq %r10, %r8 +; AVX512DQ-NEXT: shlq $4, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: shlq $5, %r10 +; AVX512DQ-NEXT: orq %r8, %r10 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $6, %r8 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $7, %r9 +; AVX512DQ-NEXT: orq %r8, %r9 +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $8, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $9, %r9 +; AVX512DQ-NEXT: orq %r8, %r9 +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $10, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $11, %r9 +; AVX512DQ-NEXT: orq %r8, %r9 +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movzbl %dil, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $13, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $15, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %rdi, %r8 +; AVX512DQ-NEXT: shlq $17, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: shlq $19, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: movzbl %dl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $20, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $21, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $22, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $23, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $24, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $25, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: shlq $26, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movzbl %cl, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $27, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $28, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $29, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $30, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $31, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: shlq $33, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $35, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $37, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $39, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $40, %r13 +; AVX512DQ-NEXT: orq %rcx, %r13 +; AVX512DQ-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $41, %rcx +; AVX512DQ-NEXT: orq %r13, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $42, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $43, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $44, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $45, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $46, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: andl $1, %r14d +; AVX512DQ-NEXT: movq %r14, %rcx +; AVX512DQ-NEXT: shlq $48, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $49, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r14, %rcx +; AVX512DQ-NEXT: shlq $50, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $51, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r14, %rcx +; AVX512DQ-NEXT: shlq $52, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r14, %rax +; AVX512DQ-NEXT: shlq $53, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $54, %r14 +; AVX512DQ-NEXT: orq %rax, %r14 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %r14, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $56, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $58, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $59, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $60, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %rcx, %r12 +; AVX512DQ-NEXT: shlq $62, %rbx +; AVX512DQ-NEXT: orq %r12, %rbx +; AVX512DQ-NEXT: shlq $63, %r11 +; AVX512DQ-NEXT: orq %rbx, %r11 +; AVX512DQ-NEXT: orq %r10, %r11 +; AVX512DQ-NEXT: movq %r11, 8(%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor7_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovd (%rdi), %k0 +; AVX512BW-NEXT: kshiftrd $13, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $12, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrd $10, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrd $22, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrd $19, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrd $18, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $29, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrd $30, %k0, %k1 +; AVX512BW-NEXT: movl %r9d, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: leal (%r10,%r10,2), %r14d +; AVX512BW-NEXT: leal (%r14,%r10,4), %r14d +; AVX512BW-NEXT: leal (%r14,%r10,8), %r14d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movl %r15d, %r12d +; AVX512BW-NEXT: shll $4, %r12d +; AVX512BW-NEXT: orl %r14d, %r12d +; AVX512BW-NEXT: movl %r15d, %r14d +; AVX512BW-NEXT: shll $5, %r14d +; AVX512BW-NEXT: orl %r12d, %r14d +; AVX512BW-NEXT: movl %r15d, %r12d +; AVX512BW-NEXT: shll $6, %r12d +; AVX512BW-NEXT: movl %r15d, %r13d +; AVX512BW-NEXT: shll $7, %r13d +; AVX512BW-NEXT: orl %r12d, %r13d +; AVX512BW-NEXT: movl %r15d, %r12d +; AVX512BW-NEXT: shll $8, %r12d +; AVX512BW-NEXT: orl %r13d, %r12d +; AVX512BW-NEXT: movl %r15d, %r13d +; AVX512BW-NEXT: shll $9, %r13d +; AVX512BW-NEXT: orl %r12d, %r13d +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $31, %k0, %k1 +; AVX512BW-NEXT: shll $10, %r15d +; AVX512BW-NEXT: orl %r13d, %r15d +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movl %ebx, %r13d +; AVX512BW-NEXT: shll $11, %r13d +; AVX512BW-NEXT: orl %r15d, %r13d +; AVX512BW-NEXT: movl %ebx, %r15d +; AVX512BW-NEXT: shll $12, %r15d +; AVX512BW-NEXT: orl %r13d, %r15d +; AVX512BW-NEXT: movl %ebx, %r13d +; AVX512BW-NEXT: shll $13, %r13d +; AVX512BW-NEXT: orl %r15d, %r13d +; AVX512BW-NEXT: movl %ebx, %r15d +; AVX512BW-NEXT: shll $14, %r15d +; AVX512BW-NEXT: orl %r13d, %r15d +; AVX512BW-NEXT: movl %ebx, %r13d +; AVX512BW-NEXT: shll $15, %r13d +; AVX512BW-NEXT: orl %r15d, %r13d +; AVX512BW-NEXT: movl %ebx, %eax +; AVX512BW-NEXT: shll $16, %eax +; AVX512BW-NEXT: orl %r13d, %eax +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $4, %k0, %k1 +; AVX512BW-NEXT: shll $17, %ebx +; AVX512BW-NEXT: orl %eax, %ebx +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movl %r12d, %eax +; AVX512BW-NEXT: shll $18, %eax +; AVX512BW-NEXT: orl %ebx, %eax +; AVX512BW-NEXT: movl %r12d, %ebx +; AVX512BW-NEXT: shll $19, %ebx +; AVX512BW-NEXT: orl %eax, %ebx +; AVX512BW-NEXT: movl %r12d, %eax +; AVX512BW-NEXT: shll $20, %eax +; AVX512BW-NEXT: orl %ebx, %eax +; AVX512BW-NEXT: movl %r12d, %ebx +; AVX512BW-NEXT: shll $21, %ebx +; AVX512BW-NEXT: orl %eax, %ebx +; AVX512BW-NEXT: movl %r12d, %eax +; AVX512BW-NEXT: shll $22, %eax +; AVX512BW-NEXT: orl %ebx, %eax +; AVX512BW-NEXT: movl %r12d, %ebx +; AVX512BW-NEXT: shll $23, %ebx +; AVX512BW-NEXT: orl %eax, %ebx +; AVX512BW-NEXT: shll $24, %r12d +; AVX512BW-NEXT: orl %ebx, %r12d +; AVX512BW-NEXT: movl %r15d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movl %eax, %ebx +; AVX512BW-NEXT: shll $25, %ebx +; AVX512BW-NEXT: orl %r12d, %ebx +; AVX512BW-NEXT: movl %eax, %r12d +; AVX512BW-NEXT: shll $26, %r12d +; AVX512BW-NEXT: orl %ebx, %r12d +; AVX512BW-NEXT: movl %eax, %ebx +; AVX512BW-NEXT: shll $27, %ebx +; AVX512BW-NEXT: orl %r12d, %ebx +; AVX512BW-NEXT: movl %eax, %r12d +; AVX512BW-NEXT: shll $28, %r12d +; AVX512BW-NEXT: orl %ebx, %r12d +; AVX512BW-NEXT: movl %eax, %r13d +; AVX512BW-NEXT: shll $29, %r13d +; AVX512BW-NEXT: orl %r12d, %r13d +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 +; AVX512BW-NEXT: shll $30, %eax +; AVX512BW-NEXT: orl %r13d, %eax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $2, %k0, %k1 +; AVX512BW-NEXT: shll $31, %r15d +; AVX512BW-NEXT: orl %eax, %r15d +; AVX512BW-NEXT: orl %r14d, %r15d +; AVX512BW-NEXT: movl %r15d, 24(%rsi) +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: movzbl %al, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r14 +; AVX512BW-NEXT: leaq (%r14,%rax,4), %r14 +; AVX512BW-NEXT: leaq (%r14,%rax,8), %r14 +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: shlq $4, %r15 +; AVX512BW-NEXT: orq %r14, %r15 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: shlq $5, %r14 +; AVX512BW-NEXT: orq %r15, %r14 +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movzbl %r12b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $11, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $3, %k0, %k1 +; AVX512BW-NEXT: shlq $13, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movzbl %r12b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $15, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $17, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $19, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrd $9, %k0, %k1 +; AVX512BW-NEXT: shlq $20, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %al, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $21, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $22, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $23, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $24, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $25, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $26, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: shlq $27, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %bl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $28, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: shlq $29, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $30, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrd $5, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %r12 +; AVX512BW-NEXT: shlq $32, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $6, %k0, %k1 +; AVX512BW-NEXT: shlq $34, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: shlq $36, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $37, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: shlq $38, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $39, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $40, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrd $7, %k0, %k1 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbx, %r12 +; AVX512BW-NEXT: shlq $43, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbx, %r12 +; AVX512BW-NEXT: shlq $45, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $47, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $8, %k0, %k1 +; AVX512BW-NEXT: shlq $48, %rbx +; AVX512BW-NEXT: orq %r13, %rbx +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $49, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: shlq $50, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $51, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: shlq $52, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $53, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: shlq $54, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrd $20, %k0, %k1 +; AVX512BW-NEXT: shlq $55, %r12 +; AVX512BW-NEXT: orq %rbx, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $57, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $59, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $61, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: shlq $62, %r13 +; AVX512BW-NEXT: orq %rbx, %r13 +; AVX512BW-NEXT: movzbl %r15b, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $63, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %rax, (%rsi) +; AVX512BW-NEXT: movzbl %cl, %r14d +; AVX512BW-NEXT: movl %r14d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r15 +; AVX512BW-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512BW-NEXT: leaq (%r15,%rax,8), %r15 +; AVX512BW-NEXT: shlq $4, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $5, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $13, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $15, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $17, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrd $23, %k0, %k1 +; AVX512BW-NEXT: shlq $18, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $19, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $20, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $21, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $22, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $23, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $24, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: shlq $25, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movzbl %r11b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $27, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $28, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $29, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrd $25, %k0, %k1 +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: shlq $32, %r11 +; AVX512BW-NEXT: orq %r12, %r11 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r13, %r11 +; AVX512BW-NEXT: shlq $34, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r13, %r11 +; AVX512BW-NEXT: shlq $36, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $37, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $38, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 +; AVX512BW-NEXT: shlq $39, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbp, %r12 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbp, %r12 +; AVX512BW-NEXT: shlq $43, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbp, %r13 +; AVX512BW-NEXT: shlq $45, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $14, %k0, %k1 +; AVX512BW-NEXT: shlq $46, %rbp +; AVX512BW-NEXT: orq %r13, %rbp +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $47, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %r11, %r13 +; AVX512BW-NEXT: shlq $48, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $49, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r11, %r13 +; AVX512BW-NEXT: shlq $50, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $51, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r11, %r13 +; AVX512BW-NEXT: shlq $52, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrd $15, %k0, %k1 +; AVX512BW-NEXT: shlq $53, %r11 +; AVX512BW-NEXT: orq %r13, %r11 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r12, %r11 +; AVX512BW-NEXT: shlq $55, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r12, %r11 +; AVX512BW-NEXT: shlq $57, %r11 +; AVX512BW-NEXT: orq %rax, %r11 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $59, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 +; AVX512BW-NEXT: shlq $60, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r10, %r13 +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrd $17, %k0, %k0 +; AVX512BW-NEXT: shlq $62, %r10 +; AVX512BW-NEXT: orq %r13, %r10 +; AVX512BW-NEXT: kmovd %k0, %r13d +; AVX512BW-NEXT: movzbl %r9b, %eax +; AVX512BW-NEXT: shlq $63, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %rax, 16(%rsi) +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: leaq (%rbx,%rbx,2), %rax +; AVX512BW-NEXT: leaq (%rax,%rbx,4), %rax +; AVX512BW-NEXT: leaq (%rax,%rbx,8), %rax +; AVX512BW-NEXT: movq %rbx, %r9 +; AVX512BW-NEXT: shlq $4, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: shlq $5, %rbx +; AVX512BW-NEXT: orq %r9, %rbx +; AVX512BW-NEXT: movzbl %r8b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $6, %r8 +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $7, %r9 +; AVX512BW-NEXT: orq %r8, %r9 +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $8, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $9, %r9 +; AVX512BW-NEXT: orq %r8, %r9 +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $10, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $11, %r9 +; AVX512BW-NEXT: orq %r8, %r9 +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movzbl %dil, %edi +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $13, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $15, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %rdi, %r8 +; AVX512BW-NEXT: shlq $17, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: shlq $19, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movzbl %dl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $20, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $21, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $22, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $23, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $24, %rdx +; AVX512BW-NEXT: orq %rdi, %rdx +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $25, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $27, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $28, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $29, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $31, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: shlq $33, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %rbp, %rdx +; AVX512BW-NEXT: shlq $35, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %rbp, %rdx +; AVX512BW-NEXT: shlq $37, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %rbp, %rdx +; AVX512BW-NEXT: shlq $39, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $40, %rbp +; AVX512BW-NEXT: orq %rdx, %rbp +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $41, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %r11, %rdx +; AVX512BW-NEXT: shlq $42, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $43, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r11, %rdx +; AVX512BW-NEXT: shlq $44, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $45, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r11, %rdx +; AVX512BW-NEXT: shlq $46, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $47, %r11 +; AVX512BW-NEXT: orq %rdx, %r11 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r11, %rax +; AVX512BW-NEXT: movq %r12, %rdx +; AVX512BW-NEXT: shlq $49, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r12, %rdx +; AVX512BW-NEXT: shlq $51, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r12, %rdx +; AVX512BW-NEXT: shlq $53, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $54, %r12 +; AVX512BW-NEXT: orq %rdx, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $55, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rdx +; AVX512BW-NEXT: shlq $56, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $57, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r13, %rdx +; AVX512BW-NEXT: shlq $58, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $59, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %r13, %rdx +; AVX512BW-NEXT: shlq $60, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %rdx, %r13 +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: shlq $62, %rcx +; AVX512BW-NEXT: orq %r13, %rcx +; AVX512BW-NEXT: shlq $63, %r14 +; AVX512BW-NEXT: orq %rcx, %r14 +; AVX512BW-NEXT: orq %rbx, %r14 +; AVX512BW-NEXT: movq %r14, 8(%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <32 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <224 x i32> + store <224 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor7_vf64(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: pushq %rbp +; AVX512F-ONLY-NEXT: pushq %r15 +; AVX512F-ONLY-NEXT: pushq %r14 +; AVX512F-ONLY-NEXT: pushq %r13 +; AVX512F-ONLY-NEXT: pushq %r12 +; AVX512F-ONLY-NEXT: pushq %rbx +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k0 +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k2 +; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k3 +; AVX512F-ONLY-NEXT: kshiftrw $15, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512F-ONLY-NEXT: kshiftrw $13, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ecx +; AVX512F-ONLY-NEXT: kshiftrw $12, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %edx +; AVX512F-ONLY-NEXT: kshiftrw $11, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %edi +; AVX512F-ONLY-NEXT: kshiftrw $13, %k2, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebx +; AVX512F-ONLY-NEXT: kshiftrw $5, %k2, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $11, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k3, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r11d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $2, %k1, %k4 +; AVX512F-ONLY-NEXT: kmovw %k1, %r14d +; AVX512F-ONLY-NEXT: movzbl %r14b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: leaq (%r12,%r12,2), %r14 +; AVX512F-ONLY-NEXT: leaq (%r14,%r12,4), %r14 +; AVX512F-ONLY-NEXT: leaq (%r14,%r12,8), %r14 +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $4, %r15 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $5, %r14 +; AVX512F-ONLY-NEXT: orq %r15, %r14 +; AVX512F-ONLY-NEXT: shlq $6, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $7, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $8, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $9, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $10, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $11, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $12, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $3, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $13, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $15, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $17, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $19, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $7, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $20, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $21, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $22, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $24, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $25, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $26, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $27, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %r8b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $28, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $29, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $30, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %r12 +; AVX512F-ONLY-NEXT: shlq $32, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $34, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r8 +; AVX512F-ONLY-NEXT: shlq $36, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $37, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r8 +; AVX512F-ONLY-NEXT: shlq $38, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $39, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $40, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $7, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r8, %r12 +; AVX512F-ONLY-NEXT: shlq $43, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r8, %r12 +; AVX512F-ONLY-NEXT: shlq $45, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r8, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r8, %r13 +; AVX512F-ONLY-NEXT: shlq $47, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $48, %r8 +; AVX512F-ONLY-NEXT: orq %r13, %r8 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $49, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r8 +; AVX512F-ONLY-NEXT: shlq $50, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $51, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r8 +; AVX512F-ONLY-NEXT: shlq $52, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $53, %rax +; AVX512F-ONLY-NEXT: orq %r8, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r8 +; AVX512F-ONLY-NEXT: shlq $54, %r8 +; AVX512F-ONLY-NEXT: orq %rax, %r8 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $9, %k1, %k4 +; AVX512F-ONLY-NEXT: shlq $55, %r12 +; AVX512F-ONLY-NEXT: orq %r8, %r12 +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $56, %r8 +; AVX512F-ONLY-NEXT: orq %r12, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $57, %r12 +; AVX512F-ONLY-NEXT: orq %r8, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $58, %r8 +; AVX512F-ONLY-NEXT: orq %r12, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $59, %r12 +; AVX512F-ONLY-NEXT: orq %r8, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r8 +; AVX512F-ONLY-NEXT: shlq $60, %r8 +; AVX512F-ONLY-NEXT: orq %r12, %r8 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %r8, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r8d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $62, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %r8b, %r8d +; AVX512F-ONLY-NEXT: movq %r8, %r12 +; AVX512F-ONLY-NEXT: shlq $63, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: orq %r14, %r12 +; AVX512F-ONLY-NEXT: movq %r12, (%rsi) +; AVX512F-ONLY-NEXT: movzbl %r15b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movzbl %r11b, %r14d +; AVX512F-ONLY-NEXT: movl %r14d, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,2), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,8), %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $5, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $6, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movzbl %r10b, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $8, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $9, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $10, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $11, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $12, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $13, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $14, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movzbl %r12b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $15, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $16, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $17, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $18, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $19, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $20, %r10 +; AVX512F-ONLY-NEXT: orq %rax, %r10 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $15, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $21, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r10d +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $22, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $23, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $24, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $25, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %rax +; AVX512F-ONLY-NEXT: shlq $26, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $27, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $28, %r10 +; AVX512F-ONLY-NEXT: orq %r12, %r10 +; AVX512F-ONLY-NEXT: movzbl %r9b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $29, %r12 +; AVX512F-ONLY-NEXT: orq %r10, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $30, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r10d +; AVX512F-ONLY-NEXT: kshiftrw $12, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %r12 +; AVX512F-ONLY-NEXT: shlq $32, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r9, %r13 +; AVX512F-ONLY-NEXT: shlq $34, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $35, %r9 +; AVX512F-ONLY-NEXT: orq %r13, %r9 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r9 +; AVX512F-ONLY-NEXT: shlq $37, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r9 +; AVX512F-ONLY-NEXT: shlq $39, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r9 +; AVX512F-ONLY-NEXT: shlq $41, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $42, %r12 +; AVX512F-ONLY-NEXT: orq %r9, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $43, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r9 +; AVX512F-ONLY-NEXT: shlq $44, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $45, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r9 +; AVX512F-ONLY-NEXT: shlq $46, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $48, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $4, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $49, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r9, %r12 +; AVX512F-ONLY-NEXT: shlq $51, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r9, %r12 +; AVX512F-ONLY-NEXT: shlq $53, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r9, %r12 +; AVX512F-ONLY-NEXT: shlq $55, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $56, %r9 +; AVX512F-ONLY-NEXT: orq %r12, %r9 +; AVX512F-ONLY-NEXT: movzbl %r10b, %eax +; AVX512F-ONLY-NEXT: # kill: def $r10d killed $r10d def $r10 +; AVX512F-ONLY-NEXT: andl $1, %r10d +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $57, %r12 +; AVX512F-ONLY-NEXT: orq %r9, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %r9 +; AVX512F-ONLY-NEXT: shlq $58, %r9 +; AVX512F-ONLY-NEXT: orq %r12, %r9 +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $59, %r12 +; AVX512F-ONLY-NEXT: orq %r9, %r12 +; AVX512F-ONLY-NEXT: movq %r10, %r9 +; AVX512F-ONLY-NEXT: shlq $60, %r9 +; AVX512F-ONLY-NEXT: orq %r12, %r9 +; AVX512F-ONLY-NEXT: movq %r10, %r12 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %r9, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %r9d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $62, %r10 +; AVX512F-ONLY-NEXT: orq %r12, %r10 +; AVX512F-ONLY-NEXT: shlq $63, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %rax, 48(%rsi) +; AVX512F-ONLY-NEXT: movzbl %r9b, %r10d +; AVX512F-ONLY-NEXT: movl %r10d, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %r15 +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%r15,%rax,8), %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $4, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $5, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $6, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $7, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $8, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $7, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $9, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movzbl %bpl, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $10, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $11, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $12, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $13, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $14, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $15, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $8, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $16, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $17, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $18, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $19, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $20, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $21, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $22, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $9, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $23, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: movzbl %al, %ebp +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $24, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %r15 +; AVX512F-ONLY-NEXT: shlq $25, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $26, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %r15 +; AVX512F-ONLY-NEXT: shlq $27, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $28, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %r13 +; AVX512F-ONLY-NEXT: shlq $29, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $30, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: movzbl %r15b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %rbp, %rax +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $32, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $34, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rbp +; AVX512F-ONLY-NEXT: shlq $36, %rbp +; AVX512F-ONLY-NEXT: orq %rax, %rbp +; AVX512F-ONLY-NEXT: kmovw %k4, %r13d +; AVX512F-ONLY-NEXT: kshiftrw $11, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $37, %r15 +; AVX512F-ONLY-NEXT: orq %rbp, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $39, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $41, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r15 +; AVX512F-ONLY-NEXT: shlq $43, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $12, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $44, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $45, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %r15 +; AVX512F-ONLY-NEXT: shlq $46, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %r15 +; AVX512F-ONLY-NEXT: shlq $48, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $49, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %r13 +; AVX512F-ONLY-NEXT: shlq $50, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $6, %k0, %k4 +; AVX512F-ONLY-NEXT: shlq $51, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $52, %rax +; AVX512F-ONLY-NEXT: orq %rbp, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $53, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $55, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $57, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $58, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: movzbl %bl, %r13d +; AVX512F-ONLY-NEXT: movl %ebx, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $59, %rbx +; AVX512F-ONLY-NEXT: orq %r15, %rbx +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $60, %r15 +; AVX512F-ONLY-NEXT: orq %rbx, %r15 +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $61, %rbx +; AVX512F-ONLY-NEXT: orq %r15, %rbx +; AVX512F-ONLY-NEXT: kmovw %k4, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k2, %k4 +; AVX512F-ONLY-NEXT: shlq $62, %rax +; AVX512F-ONLY-NEXT: orq %rbx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rbx +; AVX512F-ONLY-NEXT: shlq $63, %rbx +; AVX512F-ONLY-NEXT: orq %rax, %rbx +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $15, %k2, %k4 +; AVX512F-ONLY-NEXT: orq %r12, %rbx +; AVX512F-ONLY-NEXT: movq %rbx, 32(%rsi) +; AVX512F-ONLY-NEXT: movzbl %al, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: leaq (%r13,%r13,2), %rbx +; AVX512F-ONLY-NEXT: leaq (%rbx,%rax,4), %rbx +; AVX512F-ONLY-NEXT: leaq (%rbx,%rax,8), %rbx +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %rbx, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %rbx +; AVX512F-ONLY-NEXT: shlq $5, %rbx +; AVX512F-ONLY-NEXT: orq %r12, %rbx +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $6, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $7, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $1, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $9, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $11, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $13, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $15, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $17, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $19, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $20, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $21, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $2, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $22, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $24, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $25, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $26, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $27, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $28, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %eax +; AVX512F-ONLY-NEXT: kshiftrw $3, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $29, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $30, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $31, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $33, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $35, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k4, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $4, %k3, %k4 +; AVX512F-ONLY-NEXT: shlq $36, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: andl $1, %ebp +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $37, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %r12 +; AVX512F-ONLY-NEXT: shlq $38, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $39, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %r12 +; AVX512F-ONLY-NEXT: shlq $40, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %rbp, %rax +; AVX512F-ONLY-NEXT: shlq $41, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %rbp, %r13 +; AVX512F-ONLY-NEXT: shlq $42, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k4, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $5, %k3, %k3 +; AVX512F-ONLY-NEXT: shlq $43, %rbp +; AVX512F-ONLY-NEXT: orq %r13, %rbp +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %rbp, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $45, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $47, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $49, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $3, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $50, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $51, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $52, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $53, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $54, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $56, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $2, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $57, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: andl $1, %r11d +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $58, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r11, %r12 +; AVX512F-ONLY-NEXT: shlq $59, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r11, %rax +; AVX512F-ONLY-NEXT: shlq $60, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r11, %r12 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $4, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $62, %r11 +; AVX512F-ONLY-NEXT: orq %r12, %r11 +; AVX512F-ONLY-NEXT: shlq $63, %r14 +; AVX512F-ONLY-NEXT: orq %r11, %r14 +; AVX512F-ONLY-NEXT: orq %rbx, %r14 +; AVX512F-ONLY-NEXT: movq %r14, 40(%rsi) +; AVX512F-ONLY-NEXT: movzbl %al, %r11d +; AVX512F-ONLY-NEXT: movl %r11d, %ebx +; AVX512F-ONLY-NEXT: andl $1, %ebx +; AVX512F-ONLY-NEXT: leaq (%rbx,%rbx,2), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%rbx,4), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%rbx,8), %rax +; AVX512F-ONLY-NEXT: movq %rbx, %r12 +; AVX512F-ONLY-NEXT: shlq $4, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %r14 +; AVX512F-ONLY-NEXT: shlq $5, %r14 +; AVX512F-ONLY-NEXT: orq %r12, %r14 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $7, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $9, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $10, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $5, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $11, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movzbl %bpl, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $13, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $15, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r13, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $17, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $11, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $18, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r13d +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $19, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $20, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $21, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $22, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $23, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r13, %r12 +; AVX512F-ONLY-NEXT: shlq $24, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $25, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movzbl %r15b, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $26, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $27, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $28, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r13 +; AVX512F-ONLY-NEXT: shlq $29, %r13 +; AVX512F-ONLY-NEXT: orq %r12, %r13 +; AVX512F-ONLY-NEXT: movq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $30, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $7, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $31, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: kmovw %k3, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $8, %k0, %k3 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: shlq $32, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $34, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $35, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $36, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $37, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $38, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $9, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $39, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $40, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $41, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $42, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $43, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $44, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r13 +; AVX512F-ONLY-NEXT: shlq $45, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $46, %r15 +; AVX512F-ONLY-NEXT: orq %r13, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $48, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $49, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $50, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $51, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $52, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $10, %k1, %k3 +; AVX512F-ONLY-NEXT: shlq $53, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $54, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $55, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $56, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $57, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $58, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $59, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $60, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: movl %ebp, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $61, %r13 +; AVX512F-ONLY-NEXT: orq %r15, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %ebp +; AVX512F-ONLY-NEXT: kshiftrw $12, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $62, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: movq %rax, %r15 +; AVX512F-ONLY-NEXT: shlq $63, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: kmovw %k3, %r12d +; AVX512F-ONLY-NEXT: kshiftrw $13, %k0, %k3 +; AVX512F-ONLY-NEXT: orq %r14, %r15 +; AVX512F-ONLY-NEXT: movq %r15, 16(%rsi) +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: leaq (%rax,%rax,2), %r14 +; AVX512F-ONLY-NEXT: leaq (%r14,%rax,4), %r14 +; AVX512F-ONLY-NEXT: leaq (%r14,%rax,8), %rax +; AVX512F-ONLY-NEXT: movzbl %r12b, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $4, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %r14 +; AVX512F-ONLY-NEXT: shlq $5, %r14 +; AVX512F-ONLY-NEXT: orq %r15, %r14 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $6, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $7, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $8, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $9, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $14, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $10, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $11, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $12, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $13, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $14, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $15, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $16, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $15, %k0, %k3 +; AVX512F-ONLY-NEXT: shlq $17, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $19, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $20, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $21, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $22, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $23, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $1, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $24, %r12 +; AVX512F-ONLY-NEXT: orq %r15, %r12 +; AVX512F-ONLY-NEXT: movzbl %al, %r15d +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $25, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $26, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $27, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $28, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $29, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $30, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: shlq $31, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: kmovw %k2, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $33, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $35, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $37, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k3, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $2, %k2, %k3 +; AVX512F-ONLY-NEXT: shlq $38, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $39, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $40, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $41, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $42, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $43, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $44, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k3, %eax +; AVX512F-ONLY-NEXT: kshiftrw $3, %k2, %k2 +; AVX512F-ONLY-NEXT: shlq $45, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: movzbl %al, %r12d +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $46, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $47, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $48, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r15 +; AVX512F-ONLY-NEXT: shlq $49, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $50, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %r13 +; AVX512F-ONLY-NEXT: shlq $51, %r13 +; AVX512F-ONLY-NEXT: orq %rax, %r13 +; AVX512F-ONLY-NEXT: kmovw %k2, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-ONLY-NEXT: shlq $52, %r12 +; AVX512F-ONLY-NEXT: orq %r13, %r12 +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $53, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $54, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $56, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %r12, %rax +; AVX512F-ONLY-NEXT: movq %r15, %r12 +; AVX512F-ONLY-NEXT: shlq $58, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k1, %r13d +; AVX512F-ONLY-NEXT: shlq $59, %r15 +; AVX512F-ONLY-NEXT: orq %r12, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r9d +; AVX512F-ONLY-NEXT: movq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $60, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r9, %r12 +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %rax, %r12 +; AVX512F-ONLY-NEXT: kmovw %k0, %r15d +; AVX512F-ONLY-NEXT: kshiftrw $1, %k0, %k0 +; AVX512F-ONLY-NEXT: shlq $62, %r9 +; AVX512F-ONLY-NEXT: orq %r12, %r9 +; AVX512F-ONLY-NEXT: kmovw %k0, %r12d +; AVX512F-ONLY-NEXT: shlq $63, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: orq %r14, %r10 +; AVX512F-ONLY-NEXT: movq %r10, 24(%rsi) +; AVX512F-ONLY-NEXT: andl $1, %r8d +; AVX512F-ONLY-NEXT: leaq (%r8,%r8,2), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%r8,4), %rax +; AVX512F-ONLY-NEXT: leaq (%rax,%r8,8), %rax +; AVX512F-ONLY-NEXT: movq %r8, %r9 +; AVX512F-ONLY-NEXT: shlq $4, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $5, %r8 +; AVX512F-ONLY-NEXT: orq %r9, %r8 +; AVX512F-ONLY-NEXT: movzbl %bpl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $6, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $7, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $8, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $9, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: movq %rax, %r9 +; AVX512F-ONLY-NEXT: shlq $10, %r9 +; AVX512F-ONLY-NEXT: orq %r10, %r9 +; AVX512F-ONLY-NEXT: movq %rax, %r10 +; AVX512F-ONLY-NEXT: shlq $11, %r10 +; AVX512F-ONLY-NEXT: orq %r9, %r10 +; AVX512F-ONLY-NEXT: shlq $12, %rax +; AVX512F-ONLY-NEXT: orq %r10, %rax +; AVX512F-ONLY-NEXT: movzbl %dil, %edi +; AVX512F-ONLY-NEXT: andl $1, %edi +; AVX512F-ONLY-NEXT: movq %rdi, %r9 +; AVX512F-ONLY-NEXT: shlq $13, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $14, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %r9 +; AVX512F-ONLY-NEXT: shlq $15, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $16, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: movq %rdi, %r9 +; AVX512F-ONLY-NEXT: shlq $17, %r9 +; AVX512F-ONLY-NEXT: orq %rax, %r9 +; AVX512F-ONLY-NEXT: movq %rdi, %rax +; AVX512F-ONLY-NEXT: shlq $18, %rax +; AVX512F-ONLY-NEXT: orq %r9, %rax +; AVX512F-ONLY-NEXT: shlq $19, %rdi +; AVX512F-ONLY-NEXT: orq %rax, %rdi +; AVX512F-ONLY-NEXT: movzbl %dl, %eax +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $20, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $21, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $22, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $23, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $24, %rdx +; AVX512F-ONLY-NEXT: orq %rdi, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rdi +; AVX512F-ONLY-NEXT: shlq $25, %rdi +; AVX512F-ONLY-NEXT: orq %rdx, %rdi +; AVX512F-ONLY-NEXT: shlq $26, %rax +; AVX512F-ONLY-NEXT: orq %rdi, %rax +; AVX512F-ONLY-NEXT: movzbl %cl, %ecx +; AVX512F-ONLY-NEXT: andl $1, %ecx +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $27, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $28, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $29, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $30, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: movq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $31, %rdx +; AVX512F-ONLY-NEXT: orq %rax, %rdx +; AVX512F-ONLY-NEXT: movq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $32, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: shlq $33, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: andl $1, %r13d +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $34, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $35, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $36, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $37, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r13, %rax +; AVX512F-ONLY-NEXT: shlq $38, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r13, %rcx +; AVX512F-ONLY-NEXT: shlq $39, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $40, %r13 +; AVX512F-ONLY-NEXT: orq %rcx, %r13 +; AVX512F-ONLY-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512F-ONLY-NEXT: andl $1, %eax +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $41, %rcx +; AVX512F-ONLY-NEXT: orq %r13, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $42, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $43, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $44, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: movq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $45, %rcx +; AVX512F-ONLY-NEXT: orq %rdx, %rcx +; AVX512F-ONLY-NEXT: movq %rax, %rdx +; AVX512F-ONLY-NEXT: shlq $46, %rdx +; AVX512F-ONLY-NEXT: orq %rcx, %rdx +; AVX512F-ONLY-NEXT: shlq $47, %rax +; AVX512F-ONLY-NEXT: orq %rdx, %rax +; AVX512F-ONLY-NEXT: andl $1, %r15d +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $48, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $49, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $50, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $51, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r15, %rcx +; AVX512F-ONLY-NEXT: shlq $52, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r15, %rax +; AVX512F-ONLY-NEXT: shlq $53, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: shlq $54, %r15 +; AVX512F-ONLY-NEXT: orq %rax, %r15 +; AVX512F-ONLY-NEXT: andl $1, %r12d +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $55, %rax +; AVX512F-ONLY-NEXT: orq %r15, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $56, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $57, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $58, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: movq %r12, %rax +; AVX512F-ONLY-NEXT: shlq $59, %rax +; AVX512F-ONLY-NEXT: orq %rcx, %rax +; AVX512F-ONLY-NEXT: movq %r12, %rcx +; AVX512F-ONLY-NEXT: shlq $60, %rcx +; AVX512F-ONLY-NEXT: orq %rax, %rcx +; AVX512F-ONLY-NEXT: shlq $61, %r12 +; AVX512F-ONLY-NEXT: orq %rcx, %r12 +; AVX512F-ONLY-NEXT: shlq $62, %rbx +; AVX512F-ONLY-NEXT: orq %r12, %rbx +; AVX512F-ONLY-NEXT: shlq $63, %r11 +; AVX512F-ONLY-NEXT: orq %rbx, %r11 +; AVX512F-ONLY-NEXT: orq %r8, %r11 +; AVX512F-ONLY-NEXT: movq %r11, 8(%rsi) +; AVX512F-ONLY-NEXT: popq %rbx +; AVX512F-ONLY-NEXT: popq %r12 +; AVX512F-ONLY-NEXT: popq %r13 +; AVX512F-ONLY-NEXT: popq %r14 +; AVX512F-ONLY-NEXT: popq %r15 +; AVX512F-ONLY-NEXT: popq %rbp +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor7_vf64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: kmovw (%rdi), %k1 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k2 +; AVX512DQ-NEXT: kmovw 6(%rdi), %k3 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ecx +; AVX512DQ-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %edx +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %edi +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ebx +; AVX512DQ-NEXT: kshiftrw $5, %k2, %k4 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r9d +; AVX512DQ-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r10d +; AVX512DQ-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r11d +; AVX512DQ-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %r8d +; AVX512DQ-NEXT: kshiftrw $1, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $2, %k1, %k4 +; AVX512DQ-NEXT: kmovw %k1, %r14d +; AVX512DQ-NEXT: movzbl %r14b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: leaq (%r12,%r12,2), %r14 +; AVX512DQ-NEXT: leaq (%r14,%r12,4), %r14 +; AVX512DQ-NEXT: leaq (%r14,%r12,8), %r14 +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $4, %r15 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $5, %r14 +; AVX512DQ-NEXT: orq %r15, %r14 +; AVX512DQ-NEXT: shlq $6, %r12 +; AVX512DQ-NEXT: movzbl %al, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $7, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $8, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $9, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $10, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $11, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $12, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQ-NEXT: shlq $13, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $15, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $17, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $19, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQ-NEXT: shlq $20, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %al, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $21, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $22, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $24, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $25, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $26, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $27, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %r8b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $28, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $29, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $30, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %r12 +; AVX512DQ-NEXT: shlq $32, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: shlq $34, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %r12, %r8 +; AVX512DQ-NEXT: shlq $36, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $37, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %r12, %r8 +; AVX512DQ-NEXT: shlq $38, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $39, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $40, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r8d +; AVX512DQ-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r8, %r12 +; AVX512DQ-NEXT: shlq $43, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r8, %r12 +; AVX512DQ-NEXT: shlq $45, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r8, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r8, %r13 +; AVX512DQ-NEXT: shlq $47, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQ-NEXT: shlq $48, %r8 +; AVX512DQ-NEXT: orq %r13, %r8 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $49, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %r12, %r8 +; AVX512DQ-NEXT: shlq $50, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $51, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %r12, %r8 +; AVX512DQ-NEXT: shlq $52, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $53, %rax +; AVX512DQ-NEXT: orq %r8, %rax +; AVX512DQ-NEXT: movq %r12, %r8 +; AVX512DQ-NEXT: shlq $54, %r8 +; AVX512DQ-NEXT: orq %rax, %r8 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQ-NEXT: shlq $55, %r12 +; AVX512DQ-NEXT: orq %r8, %r12 +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $56, %r8 +; AVX512DQ-NEXT: orq %r12, %r8 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $57, %r12 +; AVX512DQ-NEXT: orq %r8, %r12 +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $58, %r8 +; AVX512DQ-NEXT: orq %r12, %r8 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $59, %r12 +; AVX512DQ-NEXT: orq %r8, %r12 +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: shlq $60, %r8 +; AVX512DQ-NEXT: orq %r12, %r8 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %r8, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r8d +; AVX512DQ-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQ-NEXT: shlq $62, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %r8b, %r8d +; AVX512DQ-NEXT: movq %r8, %r12 +; AVX512DQ-NEXT: shlq $63, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: orq %r14, %r12 +; AVX512DQ-NEXT: movq %r12, (%rsi) +; AVX512DQ-NEXT: movzbl %r15b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movzbl %r11b, %r14d +; AVX512DQ-NEXT: movl %r14d, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: leaq (%r15,%rax,2), %r15 +; AVX512DQ-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512DQ-NEXT: leaq (%r15,%rax,8), %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $5, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $6, %r12 +; AVX512DQ-NEXT: shlq $7, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movzbl %r10b, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $8, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $9, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $10, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $11, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $12, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $13, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQ-NEXT: shlq $14, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movzbl %r12b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $15, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $16, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $17, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $18, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $19, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movq %r12, %r10 +; AVX512DQ-NEXT: shlq $20, %r10 +; AVX512DQ-NEXT: orq %rax, %r10 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $15, %k3, %k4 +; AVX512DQ-NEXT: shlq $21, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: movzbl %al, %r10d +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $22, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $23, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $24, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $25, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r10, %rax +; AVX512DQ-NEXT: shlq $26, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $27, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $28, %r10 +; AVX512DQ-NEXT: orq %r12, %r10 +; AVX512DQ-NEXT: movzbl %r9b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $29, %r12 +; AVX512DQ-NEXT: orq %r10, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $30, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r10d +; AVX512DQ-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %r12 +; AVX512DQ-NEXT: shlq $32, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r9, %r13 +; AVX512DQ-NEXT: shlq $34, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQ-NEXT: shlq $35, %r9 +; AVX512DQ-NEXT: orq %r13, %r9 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %r12, %r9 +; AVX512DQ-NEXT: shlq $37, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %r12, %r9 +; AVX512DQ-NEXT: shlq $39, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %r12, %r9 +; AVX512DQ-NEXT: shlq $41, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQ-NEXT: shlq $42, %r12 +; AVX512DQ-NEXT: orq %r9, %r12 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $43, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r9 +; AVX512DQ-NEXT: shlq $44, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $45, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %r13, %r9 +; AVX512DQ-NEXT: shlq $46, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $48, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r9d +; AVX512DQ-NEXT: kshiftrw $4, %k2, %k4 +; AVX512DQ-NEXT: shlq $49, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r9, %r12 +; AVX512DQ-NEXT: shlq $51, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r9, %r12 +; AVX512DQ-NEXT: shlq $53, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r9, %r12 +; AVX512DQ-NEXT: shlq $55, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $56, %r9 +; AVX512DQ-NEXT: orq %r12, %r9 +; AVX512DQ-NEXT: movzbl %r10b, %eax +; AVX512DQ-NEXT: # kill: def $r10d killed $r10d def $r10 +; AVX512DQ-NEXT: andl $1, %r10d +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $57, %r12 +; AVX512DQ-NEXT: orq %r9, %r12 +; AVX512DQ-NEXT: movq %r10, %r9 +; AVX512DQ-NEXT: shlq $58, %r9 +; AVX512DQ-NEXT: orq %r12, %r9 +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $59, %r12 +; AVX512DQ-NEXT: orq %r9, %r12 +; AVX512DQ-NEXT: movq %r10, %r9 +; AVX512DQ-NEXT: shlq $60, %r9 +; AVX512DQ-NEXT: orq %r12, %r9 +; AVX512DQ-NEXT: movq %r10, %r12 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %r9, %r12 +; AVX512DQ-NEXT: kmovw %k4, %r9d +; AVX512DQ-NEXT: kshiftrw $6, %k2, %k4 +; AVX512DQ-NEXT: shlq $62, %r10 +; AVX512DQ-NEXT: orq %r12, %r10 +; AVX512DQ-NEXT: shlq $63, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %rax, 48(%rsi) +; AVX512DQ-NEXT: movzbl %r9b, %r10d +; AVX512DQ-NEXT: movl %r10d, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %r15 +; AVX512DQ-NEXT: leaq (%r15,%rax,4), %r15 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%r15,%rax,8), %r15 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $4, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $5, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $6, %r15 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $7, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $8, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $7, %k2, %k4 +; AVX512DQ-NEXT: shlq $9, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movzbl %bpl, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $10, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $11, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $12, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $13, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $14, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $15, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $8, %k2, %k4 +; AVX512DQ-NEXT: shlq $16, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $17, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $18, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $19, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $20, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $21, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $22, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $9, %k2, %k4 +; AVX512DQ-NEXT: shlq $23, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: movzbl %al, %ebp +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $24, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %rbp, %r15 +; AVX512DQ-NEXT: shlq $25, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $26, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %rbp, %r15 +; AVX512DQ-NEXT: shlq $27, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $28, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %rbp, %r13 +; AVX512DQ-NEXT: shlq $29, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $10, %k2, %k4 +; AVX512DQ-NEXT: shlq $30, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: movzbl %r15b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %rbp, %rax +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $32, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $34, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r15, %rbp +; AVX512DQ-NEXT: shlq $36, %rbp +; AVX512DQ-NEXT: orq %rax, %rbp +; AVX512DQ-NEXT: kmovw %k4, %r13d +; AVX512DQ-NEXT: kshiftrw $11, %k2, %k4 +; AVX512DQ-NEXT: shlq $37, %r15 +; AVX512DQ-NEXT: orq %rbp, %r15 +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $39, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $41, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r13, %r15 +; AVX512DQ-NEXT: shlq $43, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $12, %k2, %k4 +; AVX512DQ-NEXT: shlq $44, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $45, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %rbp, %r15 +; AVX512DQ-NEXT: shlq $46, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %rbp, %r15 +; AVX512DQ-NEXT: shlq $48, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $49, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %rbp, %r13 +; AVX512DQ-NEXT: shlq $50, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k4 +; AVX512DQ-NEXT: shlq $51, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $52, %rax +; AVX512DQ-NEXT: orq %rbp, %rax +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $53, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $55, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $57, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: shlq $58, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: movzbl %bl, %r13d +; AVX512DQ-NEXT: movl %ebx, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $59, %rbx +; AVX512DQ-NEXT: orq %r15, %rbx +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $60, %r15 +; AVX512DQ-NEXT: orq %rbx, %r15 +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $61, %rbx +; AVX512DQ-NEXT: orq %r15, %rbx +; AVX512DQ-NEXT: kmovw %k4, %r15d +; AVX512DQ-NEXT: kshiftrw $14, %k2, %k4 +; AVX512DQ-NEXT: shlq $62, %rax +; AVX512DQ-NEXT: orq %rbx, %rax +; AVX512DQ-NEXT: movq %r13, %rbx +; AVX512DQ-NEXT: shlq $63, %rbx +; AVX512DQ-NEXT: orq %rax, %rbx +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $15, %k2, %k4 +; AVX512DQ-NEXT: orq %r12, %rbx +; AVX512DQ-NEXT: movq %rbx, 32(%rsi) +; AVX512DQ-NEXT: movzbl %al, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: leaq (%r13,%r13,2), %rbx +; AVX512DQ-NEXT: leaq (%rbx,%rax,4), %rbx +; AVX512DQ-NEXT: leaq (%rbx,%rax,8), %rbx +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %rbx, %r12 +; AVX512DQ-NEXT: movq %rax, %rbx +; AVX512DQ-NEXT: shlq $5, %rbx +; AVX512DQ-NEXT: orq %r12, %rbx +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $6, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $7, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $1, %k3, %k4 +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $9, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $11, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $13, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: shlq $15, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $17, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $19, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $20, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $21, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 +; AVX512DQ-NEXT: shlq $22, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $24, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $25, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $26, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $27, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $28, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %eax +; AVX512DQ-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQ-NEXT: shlq $29, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $30, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $31, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $33, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $35, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k4, %ebp +; AVX512DQ-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQ-NEXT: shlq $36, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: andl $1, %ebp +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $37, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %rbp, %r12 +; AVX512DQ-NEXT: shlq $38, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $39, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %rbp, %r12 +; AVX512DQ-NEXT: shlq $40, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %rbp, %rax +; AVX512DQ-NEXT: shlq $41, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %rbp, %r13 +; AVX512DQ-NEXT: shlq $42, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k4, %r12d +; AVX512DQ-NEXT: kshiftrw $5, %k3, %k3 +; AVX512DQ-NEXT: shlq $43, %rbp +; AVX512DQ-NEXT: orq %r13, %rbp +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %rbp, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $45, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $47, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $49, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k3 +; AVX512DQ-NEXT: shlq $50, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $51, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $52, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $53, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $54, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $56, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k3 +; AVX512DQ-NEXT: shlq $57, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: andl $1, %r11d +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $58, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r11, %r12 +; AVX512DQ-NEXT: shlq $59, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r11, %rax +; AVX512DQ-NEXT: shlq $60, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r11, %r12 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k3 +; AVX512DQ-NEXT: shlq $62, %r11 +; AVX512DQ-NEXT: orq %r12, %r11 +; AVX512DQ-NEXT: shlq $63, %r14 +; AVX512DQ-NEXT: orq %r11, %r14 +; AVX512DQ-NEXT: orq %rbx, %r14 +; AVX512DQ-NEXT: movq %r14, 40(%rsi) +; AVX512DQ-NEXT: movzbl %al, %r11d +; AVX512DQ-NEXT: movl %r11d, %ebx +; AVX512DQ-NEXT: andl $1, %ebx +; AVX512DQ-NEXT: leaq (%rbx,%rbx,2), %rax +; AVX512DQ-NEXT: leaq (%rax,%rbx,4), %rax +; AVX512DQ-NEXT: leaq (%rax,%rbx,8), %rax +; AVX512DQ-NEXT: movq %rbx, %r12 +; AVX512DQ-NEXT: shlq $4, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movzbl %bpl, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %r14 +; AVX512DQ-NEXT: shlq $5, %r14 +; AVX512DQ-NEXT: orq %r12, %r14 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $7, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $9, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $10, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k3 +; AVX512DQ-NEXT: shlq $11, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movzbl %bpl, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $13, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $15, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r13, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $17, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k3 +; AVX512DQ-NEXT: shlq $18, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movzbl %al, %r13d +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $19, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $20, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $21, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $22, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $23, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r13, %r12 +; AVX512DQ-NEXT: shlq $24, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $25, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movzbl %r15b, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $26, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $27, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $28, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %rax, %r13 +; AVX512DQ-NEXT: shlq $29, %r13 +; AVX512DQ-NEXT: orq %r12, %r13 +; AVX512DQ-NEXT: movq %rax, %r12 +; AVX512DQ-NEXT: shlq $30, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k3 +; AVX512DQ-NEXT: shlq $31, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: kmovw %k3, %r12d +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k3 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: shlq $32, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $33, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $34, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $35, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $36, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $37, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $38, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %r15d +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k3 +; AVX512DQ-NEXT: shlq $39, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $40, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $41, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $42, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $43, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $44, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r13 +; AVX512DQ-NEXT: shlq $45, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %r12d +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k3 +; AVX512DQ-NEXT: shlq $46, %r15 +; AVX512DQ-NEXT: orq %r13, %r15 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $48, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $49, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $50, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $51, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $52, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %r15d +; AVX512DQ-NEXT: kshiftrw $10, %k1, %k3 +; AVX512DQ-NEXT: shlq $53, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $54, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $55, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $56, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $57, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $58, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $59, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $60, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: movl %ebp, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $61, %r13 +; AVX512DQ-NEXT: orq %r15, %r13 +; AVX512DQ-NEXT: kmovw %k3, %ebp +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k3 +; AVX512DQ-NEXT: shlq $62, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: movq %rax, %r15 +; AVX512DQ-NEXT: shlq $63, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: kmovw %k3, %r12d +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k3 +; AVX512DQ-NEXT: orq %r14, %r15 +; AVX512DQ-NEXT: movq %r15, 16(%rsi) +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: leaq (%rax,%rax,2), %r14 +; AVX512DQ-NEXT: leaq (%r14,%rax,4), %r14 +; AVX512DQ-NEXT: leaq (%r14,%rax,8), %rax +; AVX512DQ-NEXT: movzbl %r12b, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $4, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %r14 +; AVX512DQ-NEXT: shlq $5, %r14 +; AVX512DQ-NEXT: orq %r15, %r14 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $6, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $7, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $8, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $9, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k3 +; AVX512DQ-NEXT: shlq $10, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %al, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $11, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $12, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $13, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $14, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $15, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $16, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k3 +; AVX512DQ-NEXT: shlq $17, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $19, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $20, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $21, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $22, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $23, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $1, %k2, %k3 +; AVX512DQ-NEXT: shlq $24, %r12 +; AVX512DQ-NEXT: orq %r15, %r12 +; AVX512DQ-NEXT: movzbl %al, %r15d +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $25, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $26, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $27, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $28, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $29, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $30, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: shlq $31, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: kmovw %k2, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $33, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $35, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $37, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k3, %r15d +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k3 +; AVX512DQ-NEXT: shlq $38, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $39, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $40, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $41, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $42, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $43, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $44, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k3, %eax +; AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQ-NEXT: shlq $45, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: movzbl %al, %r12d +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $46, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $47, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $48, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r15 +; AVX512DQ-NEXT: shlq $49, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $50, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %r13 +; AVX512DQ-NEXT: shlq $51, %r13 +; AVX512DQ-NEXT: orq %rax, %r13 +; AVX512DQ-NEXT: kmovw %k2, %r15d +; AVX512DQ-NEXT: kshiftrw $14, %k1, %k1 +; AVX512DQ-NEXT: shlq $52, %r12 +; AVX512DQ-NEXT: orq %r13, %r12 +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $53, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $54, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $56, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %r12, %rax +; AVX512DQ-NEXT: movq %r15, %r12 +; AVX512DQ-NEXT: shlq $58, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k1, %r13d +; AVX512DQ-NEXT: shlq $59, %r15 +; AVX512DQ-NEXT: orq %r12, %r15 +; AVX512DQ-NEXT: andl $1, %r9d +; AVX512DQ-NEXT: movq %r9, %rax +; AVX512DQ-NEXT: shlq $60, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r9, %r12 +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %rax, %r12 +; AVX512DQ-NEXT: kmovw %k0, %r15d +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512DQ-NEXT: shlq $62, %r9 +; AVX512DQ-NEXT: orq %r12, %r9 +; AVX512DQ-NEXT: kmovw %k0, %r12d +; AVX512DQ-NEXT: shlq $63, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: orq %r14, %r10 +; AVX512DQ-NEXT: movq %r10, 24(%rsi) +; AVX512DQ-NEXT: andl $1, %r8d +; AVX512DQ-NEXT: leaq (%r8,%r8,2), %rax +; AVX512DQ-NEXT: leaq (%rax,%r8,4), %rax +; AVX512DQ-NEXT: leaq (%rax,%r8,8), %rax +; AVX512DQ-NEXT: movq %r8, %r9 +; AVX512DQ-NEXT: shlq $4, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: shlq $5, %r8 +; AVX512DQ-NEXT: orq %r9, %r8 +; AVX512DQ-NEXT: movzbl %bpl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $6, %r9 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $7, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $8, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $9, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: shlq $10, %r9 +; AVX512DQ-NEXT: orq %r10, %r9 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: shlq $11, %r10 +; AVX512DQ-NEXT: orq %r9, %r10 +; AVX512DQ-NEXT: shlq $12, %rax +; AVX512DQ-NEXT: orq %r10, %rax +; AVX512DQ-NEXT: movzbl %dil, %edi +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movq %rdi, %r9 +; AVX512DQ-NEXT: shlq $13, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $14, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %rdi, %r9 +; AVX512DQ-NEXT: shlq $15, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $16, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: movq %rdi, %r9 +; AVX512DQ-NEXT: shlq $17, %r9 +; AVX512DQ-NEXT: orq %rax, %r9 +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: shlq $18, %rax +; AVX512DQ-NEXT: orq %r9, %rax +; AVX512DQ-NEXT: shlq $19, %rdi +; AVX512DQ-NEXT: orq %rax, %rdi +; AVX512DQ-NEXT: movzbl %dl, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $20, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $21, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $22, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $23, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $24, %rdx +; AVX512DQ-NEXT: orq %rdi, %rdx +; AVX512DQ-NEXT: movq %rax, %rdi +; AVX512DQ-NEXT: shlq $25, %rdi +; AVX512DQ-NEXT: orq %rdx, %rdi +; AVX512DQ-NEXT: shlq $26, %rax +; AVX512DQ-NEXT: orq %rdi, %rax +; AVX512DQ-NEXT: movzbl %cl, %ecx +; AVX512DQ-NEXT: andl $1, %ecx +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $27, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $28, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $29, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $30, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: movq %rcx, %rdx +; AVX512DQ-NEXT: shlq $31, %rdx +; AVX512DQ-NEXT: orq %rax, %rdx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shlq $32, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: shlq $33, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: andl $1, %r13d +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $34, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $35, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $36, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $37, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r13, %rax +; AVX512DQ-NEXT: shlq $38, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r13, %rcx +; AVX512DQ-NEXT: shlq $39, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $40, %r13 +; AVX512DQ-NEXT: orq %rcx, %r13 +; AVX512DQ-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $41, %rcx +; AVX512DQ-NEXT: orq %r13, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $42, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $43, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $44, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shlq $45, %rcx +; AVX512DQ-NEXT: orq %rdx, %rcx +; AVX512DQ-NEXT: movq %rax, %rdx +; AVX512DQ-NEXT: shlq $46, %rdx +; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: shlq $47, %rax +; AVX512DQ-NEXT: orq %rdx, %rax +; AVX512DQ-NEXT: andl $1, %r15d +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $48, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $49, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $50, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $51, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r15, %rcx +; AVX512DQ-NEXT: shlq $52, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r15, %rax +; AVX512DQ-NEXT: shlq $53, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: shlq $54, %r15 +; AVX512DQ-NEXT: orq %rax, %r15 +; AVX512DQ-NEXT: andl $1, %r12d +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $55, %rax +; AVX512DQ-NEXT: orq %r15, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $56, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $57, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $58, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: movq %r12, %rax +; AVX512DQ-NEXT: shlq $59, %rax +; AVX512DQ-NEXT: orq %rcx, %rax +; AVX512DQ-NEXT: movq %r12, %rcx +; AVX512DQ-NEXT: shlq $60, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx +; AVX512DQ-NEXT: shlq $61, %r12 +; AVX512DQ-NEXT: orq %rcx, %r12 +; AVX512DQ-NEXT: shlq $62, %rbx +; AVX512DQ-NEXT: orq %r12, %rbx +; AVX512DQ-NEXT: shlq $63, %r11 +; AVX512DQ-NEXT: orq %rbx, %r11 +; AVX512DQ-NEXT: orq %r8, %r11 +; AVX512DQ-NEXT: movq %r11, 8(%rsi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor7_vf64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: pushq %r15 +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 +; AVX512BW-NEXT: pushq %r12 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: kshiftrq $13, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrq $12, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ecx +; AVX512BW-NEXT: kshiftrq $11, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edx +; AVX512BW-NEXT: kshiftrq $10, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %edi +; AVX512BW-NEXT: kshiftrq $45, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrq $37, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrq $54, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r11d +; AVX512BW-NEXT: kshiftrq $4, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $1, %k0, %k1 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $2, %k0, %k1 +; AVX512BW-NEXT: kmovd %k0, %r14d +; AVX512BW-NEXT: movzbl %r14b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: leaq (%r12,%r12,2), %r14 +; AVX512BW-NEXT: leaq (%r14,%r12,4), %r14 +; AVX512BW-NEXT: leaq (%r14,%r12,8), %r14 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $4, %r13 +; AVX512BW-NEXT: orq %r14, %r13 +; AVX512BW-NEXT: movq %r12, %r14 +; AVX512BW-NEXT: shlq $5, %r14 +; AVX512BW-NEXT: orq %r13, %r14 +; AVX512BW-NEXT: shlq $6, %r12 +; AVX512BW-NEXT: movzbl %r15b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $7, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $8, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $9, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $10, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $12, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $3, %k0, %k1 +; AVX512BW-NEXT: shlq $13, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %r13b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $14, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $15, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $16, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $17, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $18, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $19, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $55, %k0, %k1 +; AVX512BW-NEXT: shlq $20, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %r13b, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $21, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $22, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $23, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $24, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $25, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $26, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: shlq $27, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $28, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $29, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $30, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $5, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $32, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: shlq $33, %r8 +; AVX512BW-NEXT: orq %r13, %r8 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %r8 +; AVX512BW-NEXT: shlq $35, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %r12, %r8 +; AVX512BW-NEXT: shlq $37, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %r12, %r8 +; AVX512BW-NEXT: shlq $39, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: kmovd %k1, %r8d +; AVX512BW-NEXT: kshiftrq $7, %k0, %k1 +; AVX512BW-NEXT: shlq $41, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r8, %r12 +; AVX512BW-NEXT: shlq $43, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r8, %r12 +; AVX512BW-NEXT: shlq $45, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r8, %r13 +; AVX512BW-NEXT: shlq $47, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $8, %k0, %k1 +; AVX512BW-NEXT: shlq $48, %r8 +; AVX512BW-NEXT: orq %r13, %r8 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $49, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %r12, %r8 +; AVX512BW-NEXT: shlq $50, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $51, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %r12, %r8 +; AVX512BW-NEXT: shlq $52, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $53, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %r12, %r8 +; AVX512BW-NEXT: shlq $54, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $9, %k0, %k1 +; AVX512BW-NEXT: shlq $55, %r12 +; AVX512BW-NEXT: orq %r8, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r8 +; AVX512BW-NEXT: shlq $57, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %r13, %r8 +; AVX512BW-NEXT: shlq $59, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %r8, %rax +; AVX512BW-NEXT: movq %r13, %r8 +; AVX512BW-NEXT: shlq $61, %r8 +; AVX512BW-NEXT: orq %rax, %r8 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $57, %k0, %k1 +; AVX512BW-NEXT: shlq $62, %r13 +; AVX512BW-NEXT: orq %r8, %r13 +; AVX512BW-NEXT: movzbl %al, %r8d +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: shlq $63, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %rax, (%rsi) +; AVX512BW-NEXT: movzbl %r15b, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movzbl %r11b, %r14d +; AVX512BW-NEXT: movl %r14d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%r12,2), %rax +; AVX512BW-NEXT: leaq (%rax,%r12,4), %rax +; AVX512BW-NEXT: leaq (%rax,%r12,8), %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $4, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $5, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movzbl %r10b, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $11, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $13, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 +; AVX512BW-NEXT: shlq $14, %r10 +; AVX512BW-NEXT: orq %r12, %r10 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $15, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $16, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $17, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $18, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $19, %rax +; AVX512BW-NEXT: orq %r10, %rax +; AVX512BW-NEXT: movq %r12, %r10 +; AVX512BW-NEXT: shlq $20, %r10 +; AVX512BW-NEXT: orq %rax, %r10 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $63, %k0, %k1 +; AVX512BW-NEXT: shlq $21, %r12 +; AVX512BW-NEXT: orq %r10, %r12 +; AVX512BW-NEXT: movzbl %al, %r10d +; AVX512BW-NEXT: andl $1, %r10d +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $23, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $25, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r10, %r12 +; AVX512BW-NEXT: shlq $27, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: shlq $28, %r10 +; AVX512BW-NEXT: orq %r12, %r10 +; AVX512BW-NEXT: movzbl %r9b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: shlq $29, %r12 +; AVX512BW-NEXT: orq %r10, %r12 +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $30, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: kmovd %k1, %r10d +; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %r12 +; AVX512BW-NEXT: shlq $32, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $34, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $61, %k0, %k1 +; AVX512BW-NEXT: shlq $35, %r9 +; AVX512BW-NEXT: orq %r13, %r9 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r12, %r9 +; AVX512BW-NEXT: shlq $37, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r12, %r9 +; AVX512BW-NEXT: shlq $39, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $41, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrq $62, %k0, %k1 +; AVX512BW-NEXT: shlq $42, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $43, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r9, %r12 +; AVX512BW-NEXT: shlq $44, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $45, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r9, %r12 +; AVX512BW-NEXT: shlq $46, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $47, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $48, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $36, %k0, %k1 +; AVX512BW-NEXT: shlq $49, %r9 +; AVX512BW-NEXT: orq %r13, %r9 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r12, %r9 +; AVX512BW-NEXT: shlq $51, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r12, %r9 +; AVX512BW-NEXT: shlq $53, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r12, %r9 +; AVX512BW-NEXT: shlq $55, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: shlq $56, %r12 +; AVX512BW-NEXT: orq %r9, %r12 +; AVX512BW-NEXT: movl %r10d, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $57, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r9 +; AVX512BW-NEXT: shlq $58, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $59, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movq %r13, %r9 +; AVX512BW-NEXT: shlq $60, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $61, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: kmovd %k1, %r9d +; AVX512BW-NEXT: kshiftrq $38, %k0, %k1 +; AVX512BW-NEXT: movzbl %r10b, %r10d +; AVX512BW-NEXT: shlq $62, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: shlq $63, %r10 +; AVX512BW-NEXT: orq %r13, %r10 +; AVX512BW-NEXT: orq %r15, %r10 +; AVX512BW-NEXT: movq %r10, 48(%rsi) +; AVX512BW-NEXT: movzbl %r9b, %r10d +; AVX512BW-NEXT: movl %r10d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r15 +; AVX512BW-NEXT: leaq (%r15,%rax,4), %rax +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: leaq (%rax,%r13,8), %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $4, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $5, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $7, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $39, %k0, %k1 +; AVX512BW-NEXT: shlq $9, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $13, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $15, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $40, %k0, %k1 +; AVX512BW-NEXT: shlq $16, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $17, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $18, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $19, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $20, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $21, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $22, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $41, %k0, %k1 +; AVX512BW-NEXT: shlq $23, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movzbl %al, %ebp +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $24, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbp, %r15 +; AVX512BW-NEXT: shlq $25, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %rbp, %r15 +; AVX512BW-NEXT: shlq $27, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $28, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %rbp, %r13 +; AVX512BW-NEXT: shlq $29, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $42, %k0, %k1 +; AVX512BW-NEXT: shlq $30, %rbp +; AVX512BW-NEXT: orq %r13, %rbp +; AVX512BW-NEXT: movzbl %r15b, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: shlq $31, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $32, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $34, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %rbp +; AVX512BW-NEXT: shlq $36, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 +; AVX512BW-NEXT: shlq $37, %r15 +; AVX512BW-NEXT: orq %rbp, %r15 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $39, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $41, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $43, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $44, %k0, %k1 +; AVX512BW-NEXT: shlq $44, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $45, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $46, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $47, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $48, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $49, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %rbp +; AVX512BW-NEXT: shlq $50, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $22, %k0, %k1 +; AVX512BW-NEXT: shlq $51, %r15 +; AVX512BW-NEXT: orq %rbp, %r15 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $53, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $55, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $57, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: shlq $58, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movl %ebx, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: shlq $59, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $60, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: movq %rax, %rbp +; AVX512BW-NEXT: shlq $61, %rbp +; AVX512BW-NEXT: orq %r13, %rbp +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $46, %k0, %k1 +; AVX512BW-NEXT: movzbl %bl, %ebx +; AVX512BW-NEXT: shlq $62, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $63, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $47, %k0, %k1 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movq %r13, 32(%rsi) +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: leaq (%rbx,%rbx,2), %rax +; AVX512BW-NEXT: leaq (%rax,%r13,4), %rax +; AVX512BW-NEXT: leaq (%rax,%r13,8), %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $4, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $5, %r12 +; AVX512BW-NEXT: orq %rbx, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $7, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 +; AVX512BW-NEXT: shlq $8, %r13 +; AVX512BW-NEXT: orq %rbx, %r13 +; AVX512BW-NEXT: movzbl %al, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $9, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $10, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $11, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $12, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $13, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $14, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $49, %k0, %k1 +; AVX512BW-NEXT: shlq $15, %rbx +; AVX512BW-NEXT: orq %r13, %rbx +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $17, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $19, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $21, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $50, %k0, %k1 +; AVX512BW-NEXT: shlq $22, %r13 +; AVX512BW-NEXT: orq %rbx, %r13 +; AVX512BW-NEXT: movzbl %al, %ebx +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $23, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $24, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $25, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $26, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $27, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $28, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $51, %k0, %k1 +; AVX512BW-NEXT: shlq $29, %rbx +; AVX512BW-NEXT: orq %r13, %rbx +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $31, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $33, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $35, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $52, %k0, %k1 +; AVX512BW-NEXT: shlq $36, %r13 +; AVX512BW-NEXT: orq %rbx, %r13 +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $37, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbp, %rbx +; AVX512BW-NEXT: shlq $38, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $39, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %rbp, %rbx +; AVX512BW-NEXT: shlq $40, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $41, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %rbp, %r13 +; AVX512BW-NEXT: shlq $42, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 +; AVX512BW-NEXT: shlq $43, %rbp +; AVX512BW-NEXT: orq %r13, %rbp +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $45, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $47, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: shlq $49, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $19, %k0, %k1 +; AVX512BW-NEXT: shlq $50, %rbx +; AVX512BW-NEXT: orq %r13, %rbx +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $51, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $52, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $53, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $54, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $55, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r13, %rbx +; AVX512BW-NEXT: shlq $56, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $18, %k0, %k1 +; AVX512BW-NEXT: shlq $57, %r13 +; AVX512BW-NEXT: orq %rbx, %r13 +; AVX512BW-NEXT: andl $1, %r11d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r11, %rbx +; AVX512BW-NEXT: shlq $59, %rbx +; AVX512BW-NEXT: orq %rax, %rbx +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %rbx, %rax +; AVX512BW-NEXT: movq %r11, %r13 +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebx +; AVX512BW-NEXT: kshiftrq $20, %k0, %k1 +; AVX512BW-NEXT: shlq $62, %r11 +; AVX512BW-NEXT: orq %r13, %r11 +; AVX512BW-NEXT: shlq $63, %r14 +; AVX512BW-NEXT: orq %r11, %r14 +; AVX512BW-NEXT: orq %r12, %r14 +; AVX512BW-NEXT: movq %r14, 40(%rsi) +; AVX512BW-NEXT: movzbl %bl, %r11d +; AVX512BW-NEXT: movl %r11d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: leaq (%rax,%rax,2), %r14 +; AVX512BW-NEXT: leaq (%r14,%rax,4), %r14 +; AVX512BW-NEXT: leaq (%r14,%rax,8), %r14 +; AVX512BW-NEXT: shlq $4, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movzbl %bpl, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %r14 +; AVX512BW-NEXT: shlq $5, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $10, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $21, %k0, %k1 +; AVX512BW-NEXT: shlq $11, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movzbl %bpl, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $13, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $15, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $17, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 +; AVX512BW-NEXT: shlq $18, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: movzbl %al, %r13d +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $19, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $20, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $21, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $22, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $23, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $24, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: shlq $25, %r13 +; AVX512BW-NEXT: orq %r12, %r13 +; AVX512BW-NEXT: movzbl %r15b, %ebp +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %rbp, %r12 +; AVX512BW-NEXT: shlq $27, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $28, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %rbp, %r12 +; AVX512BW-NEXT: shlq $29, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $23, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $24, %k0, %k1 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: shlq $32, %r15 +; AVX512BW-NEXT: orq %rbp, %r15 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $33, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $34, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $35, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $36, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $37, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $38, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $25, %k0, %k1 +; AVX512BW-NEXT: shlq $39, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $40, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $41, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $42, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $43, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $44, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %rbp +; AVX512BW-NEXT: shlq $45, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $26, %k0, %k1 +; AVX512BW-NEXT: shlq $46, %r15 +; AVX512BW-NEXT: orq %rbp, %r15 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $47, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $48, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $49, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %r15 +; AVX512BW-NEXT: shlq $50, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $51, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $52, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $31, %k0, %k1 +; AVX512BW-NEXT: shlq $53, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $54, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $55, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $56, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $57, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $58, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $59, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: shlq $60, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movl %r12d, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %r15, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $28, %k0, %k1 +; AVX512BW-NEXT: movzbl %r12b, %r15d +; AVX512BW-NEXT: shlq $62, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $63, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $29, %k0, %k1 +; AVX512BW-NEXT: orq %r14, %r12 +; AVX512BW-NEXT: movq %r12, 16(%rsi) +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: leaq (%r15,%r15,2), %r14 +; AVX512BW-NEXT: leaq (%r14,%r15,4), %r14 +; AVX512BW-NEXT: leaq (%r14,%r15,8), %r14 +; AVX512BW-NEXT: movzbl %al, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $4, %rax +; AVX512BW-NEXT: orq %r14, %rax +; AVX512BW-NEXT: movq %r15, %r14 +; AVX512BW-NEXT: shlq $5, %r14 +; AVX512BW-NEXT: orq %rax, %r14 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $6, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $7, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $8, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $9, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $30, %k0, %k1 +; AVX512BW-NEXT: shlq $10, %r15 +; AVX512BW-NEXT: orq %r12, %r15 +; AVX512BW-NEXT: movzbl %al, %r12d +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $11, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $12, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $13, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $14, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $15, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $16, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: kmovd %k1, %eax +; AVX512BW-NEXT: kshiftrq $32, %k0, %k1 +; AVX512BW-NEXT: shlq $17, %r12 +; AVX512BW-NEXT: orq %r15, %r12 +; AVX512BW-NEXT: movzbl %al, %r15d +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $19, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $20, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r12 +; AVX512BW-NEXT: shlq $21, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $22, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $23, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $33, %k0, %k1 +; AVX512BW-NEXT: shlq $24, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: movzbl %bpl, %ebp +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $25, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %rbp, %r15 +; AVX512BW-NEXT: shlq $26, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $27, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %rbp, %r15 +; AVX512BW-NEXT: shlq $28, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $29, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %rbp, %r15 +; AVX512BW-NEXT: shlq $30, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: kmovd %k1, %r13d +; AVX512BW-NEXT: kshiftrq $34, %k0, %k1 +; AVX512BW-NEXT: shlq $31, %rbp +; AVX512BW-NEXT: orq %r15, %rbp +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $33, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $35, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %rbp +; AVX512BW-NEXT: shlq $37, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $35, %k0, %k1 +; AVX512BW-NEXT: shlq $38, %r12 +; AVX512BW-NEXT: orq %rbp, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $39, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $40, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $41, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %r12 +; AVX512BW-NEXT: shlq $42, %r12 +; AVX512BW-NEXT: orq %rax, %r12 +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $43, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rbp +; AVX512BW-NEXT: shlq $44, %rbp +; AVX512BW-NEXT: orq %rax, %rbp +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $14, %k0, %k1 +; AVX512BW-NEXT: shlq $45, %r13 +; AVX512BW-NEXT: orq %rbp, %r13 +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $46, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $47, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $49, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %r13, %rax +; AVX512BW-NEXT: movq %r15, %r13 +; AVX512BW-NEXT: shlq $51, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %ebp +; AVX512BW-NEXT: kshiftrq $15, %k0, %k1 +; AVX512BW-NEXT: shlq $52, %r15 +; AVX512BW-NEXT: orq %r13, %r15 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $53, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $54, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $55, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r15 +; AVX512BW-NEXT: shlq $56, %r15 +; AVX512BW-NEXT: orq %rax, %r15 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $57, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %r13 +; AVX512BW-NEXT: shlq $58, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r15d +; AVX512BW-NEXT: kshiftrq $16, %k0, %k1 +; AVX512BW-NEXT: shlq $59, %r12 +; AVX512BW-NEXT: orq %r13, %r12 +; AVX512BW-NEXT: andl $1, %r9d +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: shlq $60, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r9, %r13 +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %rax, %r13 +; AVX512BW-NEXT: kmovd %k1, %r12d +; AVX512BW-NEXT: kshiftrq $17, %k0, %k0 +; AVX512BW-NEXT: shlq $62, %r9 +; AVX512BW-NEXT: orq %r13, %r9 +; AVX512BW-NEXT: kmovd %k0, %r13d +; AVX512BW-NEXT: shlq $63, %r10 +; AVX512BW-NEXT: orq %r9, %r10 +; AVX512BW-NEXT: orq %r14, %r10 +; AVX512BW-NEXT: movq %r10, 24(%rsi) +; AVX512BW-NEXT: andl $1, %r8d +; AVX512BW-NEXT: leaq (%r8,%r8,2), %rax +; AVX512BW-NEXT: leaq (%rax,%r8,4), %rax +; AVX512BW-NEXT: leaq (%rax,%r8,8), %rax +; AVX512BW-NEXT: movq %r8, %r9 +; AVX512BW-NEXT: shlq $4, %r9 +; AVX512BW-NEXT: orq %rax, %r9 +; AVX512BW-NEXT: shlq $5, %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: movzbl %dil, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $6, %rdi +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $7, %r9 +; AVX512BW-NEXT: orq %rdi, %r9 +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $8, %rdi +; AVX512BW-NEXT: orq %r9, %rdi +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $9, %r9 +; AVX512BW-NEXT: orq %rdi, %r9 +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: shlq $10, %rdi +; AVX512BW-NEXT: orq %r9, %rdi +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: shlq $11, %r9 +; AVX512BW-NEXT: orq %rdi, %r9 +; AVX512BW-NEXT: shlq $12, %rax +; AVX512BW-NEXT: orq %r9, %rax +; AVX512BW-NEXT: movzbl %dl, %edx +; AVX512BW-NEXT: andl $1, %edx +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $13, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $14, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $15, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $16, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: shlq $17, %rdi +; AVX512BW-NEXT: orq %rax, %rdi +; AVX512BW-NEXT: movq %rdx, %rax +; AVX512BW-NEXT: shlq $18, %rax +; AVX512BW-NEXT: orq %rdi, %rax +; AVX512BW-NEXT: shlq $19, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movzbl %cl, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $20, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $21, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $22, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $23, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shlq $24, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: movq %rax, %rdx +; AVX512BW-NEXT: shlq $25, %rdx +; AVX512BW-NEXT: orq %rcx, %rdx +; AVX512BW-NEXT: shlq $26, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $27, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $28, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $29, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $30, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: movq %rcx, %rdx +; AVX512BW-NEXT: shlq $31, %rdx +; AVX512BW-NEXT: orq %rax, %rdx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shlq $32, %rax +; AVX512BW-NEXT: orq %rdx, %rax +; AVX512BW-NEXT: shlq $33, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: andl $1, %ebp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $34, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %rbp, %rcx +; AVX512BW-NEXT: shlq $35, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $36, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %rbp, %rcx +; AVX512BW-NEXT: shlq $37, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: shlq $38, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %rbp, %rcx +; AVX512BW-NEXT: shlq $39, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $40, %rbp +; AVX512BW-NEXT: orq %rcx, %rbp +; AVX512BW-NEXT: andl $1, %r15d +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $41, %rax +; AVX512BW-NEXT: orq %rbp, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $42, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $43, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $44, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: shlq $45, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r15, %rcx +; AVX512BW-NEXT: shlq $46, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $47, %r15 +; AVX512BW-NEXT: orq %rcx, %r15 +; AVX512BW-NEXT: andl $1, %r12d +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $48, %rax +; AVX512BW-NEXT: orq %r15, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $49, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $50, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $51, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: shlq $52, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: shlq $53, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $54, %r12 +; AVX512BW-NEXT: orq %rcx, %r12 +; AVX512BW-NEXT: andl $1, %r13d +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $55, %rax +; AVX512BW-NEXT: orq %r12, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $56, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $57, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $58, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: movq %r13, %rax +; AVX512BW-NEXT: shlq $59, %rax +; AVX512BW-NEXT: orq %rcx, %rax +; AVX512BW-NEXT: movq %r13, %rcx +; AVX512BW-NEXT: shlq $60, %rcx +; AVX512BW-NEXT: orq %rax, %rcx +; AVX512BW-NEXT: shlq $61, %r13 +; AVX512BW-NEXT: orq %rcx, %r13 +; AVX512BW-NEXT: andl $1, %ebx +; AVX512BW-NEXT: shlq $62, %rbx +; AVX512BW-NEXT: orq %r13, %rbx +; AVX512BW-NEXT: shlq $63, %r11 +; AVX512BW-NEXT: orq %rbx, %r11 +; AVX512BW-NEXT: orq %r8, %r11 +; AVX512BW-NEXT: movq %r11, 8(%rsi) +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: popq %r15 +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: retq + %src.vec = load <64 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <448 x i32> + store <448 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor8_vf2(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor8_vf2: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor8_vf2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor8_vf2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovw %k0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %src.vec = load <2 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <16 x i32> + store <16 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor8_vf4(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor8_vf4: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: kmovw %k1, 2(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor8_vf4: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: kmovw %k1, 2(%rsi) +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor8_vf4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %src.vec = load <4 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <32 x i32> + store <32 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor8_vf8(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor8_vf8: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 +; AVX512F-ONLY-NEXT: kmovw %k3, 6(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k2, 4(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k1, 2(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor8_vf8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 +; AVX512DQ-NEXT: kmovw %k3, 6(%rsi) +; AVX512DQ-NEXT: kmovw %k2, 4(%rsi) +; AVX512DQ-NEXT: kmovw %k1, 2(%rsi) +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k0 +; AVX512BW-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k0 +; AVX512VBMI-ONLY-NEXT: kmovq %k0, (%rsi) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq + %src.vec = load <8 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <64 x i32> + store <64 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor8_vf16(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor8_vf16: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 +; AVX512F-ONLY-NEXT: kmovw %k7, 12(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k6, 14(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k5, 8(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k4, 10(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k3, 4(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k2, 6(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k1, 2(%rsi) +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor8_vf16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 +; AVX512DQ-NEXT: kmovw %k7, 12(%rsi) +; AVX512DQ-NEXT: kmovw %k6, 14(%rsi) +; AVX512DQ-NEXT: kmovw %k5, 8(%rsi) +; AVX512DQ-NEXT: kmovw %k4, 10(%rsi) +; AVX512DQ-NEXT: kmovw %k3, 4(%rsi) +; AVX512DQ-NEXT: kmovw %k2, 6(%rsi) +; AVX512DQ-NEXT: kmovw %k1, 2(%rsi) +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor8_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k0 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: kmovq %k1, 8(%rsi) +; AVX512BW-NEXT: kmovq %k0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %src.vec = load <16 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <128 x i32> + store <128 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor8_vf32(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm14 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm0 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm13, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm15, %zmm15 +; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 28(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 30(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 24(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 26(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 20(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 22(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 16(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 18(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 12(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 14(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 8(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 10(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 4(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 6(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 2(%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor8_vf32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm13, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm15, %zmm15 +; AVX512DQ-NEXT: vpmovd2m %zmm15, %k0 +; AVX512DQ-NEXT: kmovw %k0, 28(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k0 +; AVX512DQ-NEXT: kmovw %k0, 30(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm11, %k0 +; AVX512DQ-NEXT: kmovw %k0, 24(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm9, %k0 +; AVX512DQ-NEXT: kmovw %k0, 26(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k0 +; AVX512DQ-NEXT: kmovw %k0, 20(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k0 +; AVX512DQ-NEXT: kmovw %k0, 22(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512DQ-NEXT: kmovw %k0, 16(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, 18(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, 12(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm14, %k0 +; AVX512DQ-NEXT: kmovw %k0, 14(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm12, %k0 +; AVX512DQ-NEXT: kmovw %k0, 8(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm10, %k0 +; AVX512DQ-NEXT: kmovw %k0, 10(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k0 +; AVX512DQ-NEXT: kmovw %k0, 4(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm6, %k0 +; AVX512DQ-NEXT: kmovw %k0, 6(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512DQ-NEXT: kmovw %k0, 2(%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor8_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovd (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpmovb2m %zmm3, %k0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k3 +; AVX512BW-NEXT: kmovq %k3, 24(%rsi) +; AVX512BW-NEXT: kmovq %k2, 16(%rsi) +; AVX512BW-NEXT: kmovq %k1, 8(%rsi) +; AVX512BW-NEXT: kmovq %k0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %src.vec = load <32 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <256 x i32> + store <256 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} + +define void @mask_replication_factor8_vf64(ptr %in.vec, ptr %out.vec) nounwind { +; AVX512F-ONLY-LABEL: mask_replication_factor8_vf64: +; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm14 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm16 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm17, %zmm4 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm25 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm26 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm27 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm28 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm11, %zmm29 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm13, %zmm30 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm15, %zmm31 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm17, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm17, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 60(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 62(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 56(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 58(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 52(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 54(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 48(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 50(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 44(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 46(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 40(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 42(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 36(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 38(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 32(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 34(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 28(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 30(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 24(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 26(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 20(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 22(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 16(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 18(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 12(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 14(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 8(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 10(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 4(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 6(%rsi) +; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, (%rsi) +; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-ONLY-NEXT: kmovw %k0, 2(%rsi) +; AVX512F-ONLY-NEXT: vzeroupper +; AVX512F-ONLY-NEXT: retq +; +; AVX512DQ-LABEL: mask_replication_factor8_vf64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm17, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm25 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm26 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm27 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm28 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm11, %zmm29 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm13, %zmm30 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm15, %zmm31 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm17, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm17, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, 60(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm15, %k0 +; AVX512DQ-NEXT: kmovw %k0, 62(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k0 +; AVX512DQ-NEXT: kmovw %k0, 56(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm11, %k0 +; AVX512DQ-NEXT: kmovw %k0, 58(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm9, %k0 +; AVX512DQ-NEXT: kmovw %k0, 52(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k0 +; AVX512DQ-NEXT: kmovw %k0, 54(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k0 +; AVX512DQ-NEXT: kmovw %k0, 48(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512DQ-NEXT: kmovw %k0, 50(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512DQ-NEXT: kmovw %k0, 44(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm31, %k0 +; AVX512DQ-NEXT: kmovw %k0, 46(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm30, %k0 +; AVX512DQ-NEXT: kmovw %k0, 40(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm29, %k0 +; AVX512DQ-NEXT: kmovw %k0, 42(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm28, %k0 +; AVX512DQ-NEXT: kmovw %k0, 36(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm27, %k0 +; AVX512DQ-NEXT: kmovw %k0, 38(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm26, %k0 +; AVX512DQ-NEXT: kmovw %k0, 32(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm25, %k0 +; AVX512DQ-NEXT: kmovw %k0, 34(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, 28(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm24, %k0 +; AVX512DQ-NEXT: kmovw %k0, 30(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm23, %k0 +; AVX512DQ-NEXT: kmovw %k0, 24(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm22, %k0 +; AVX512DQ-NEXT: kmovw %k0, 26(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm21, %k0 +; AVX512DQ-NEXT: kmovw %k0, 20(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm20, %k0 +; AVX512DQ-NEXT: kmovw %k0, 22(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm19, %k0 +; AVX512DQ-NEXT: kmovw %k0, 16(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm18, %k0 +; AVX512DQ-NEXT: kmovw %k0, 18(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k0 +; AVX512DQ-NEXT: kmovw %k0, 12(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm16, %k0 +; AVX512DQ-NEXT: kmovw %k0, 14(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm14, %k0 +; AVX512DQ-NEXT: kmovw %k0, 8(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm12, %k0 +; AVX512DQ-NEXT: kmovw %k0, 10(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm10, %k0 +; AVX512DQ-NEXT: kmovw %k0, 4(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k0 +; AVX512DQ-NEXT: kmovw %k0, 6(%rsi) +; AVX512DQ-NEXT: vpmovd2m %zmm6, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rsi) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, 2(%rsi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_replication_factor8_vf64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpmovb2m %zmm3, %k0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vpmovb2m %zmm4, %k2 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k3 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vpmovb2m %zmm4, %k4 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k5 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k6 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k7 +; AVX512BW-NEXT: kmovq %k7, 48(%rsi) +; AVX512BW-NEXT: kmovq %k6, 56(%rsi) +; AVX512BW-NEXT: kmovq %k5, 32(%rsi) +; AVX512BW-NEXT: kmovq %k4, 40(%rsi) +; AVX512BW-NEXT: kmovq %k3, 16(%rsi) +; AVX512BW-NEXT: kmovq %k2, 24(%rsi) +; AVX512BW-NEXT: kmovq %k1, 8(%rsi) +; AVX512BW-NEXT: kmovq %k0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %src.vec = load <64 x i1>, ptr %in.vec, align 64 + %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <512 x i32> + store <512 x i1> %tgt.mask, ptr %out.vec, align 64 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512: {{.*}} +; FALLBACK0: {{.*}} +; FALLBACK1: {{.*}} +; FALLBACK2: {{.*}} +; FALLBACK3: {{.*}} +; FALLBACK4: {{.*}} +; FALLBACK5: {{.*}} +; FALLBACK6: {{.*}} +; FALLBACK7: {{.*}}