diff --git a/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll b/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll
new file mode 100644
index 00000000000000..e105d4e3f0ccce
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vpopcntdq,+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK
+
+
+define <16 x i32> @combine_mask_with_or(<16 x i32> %v0) {
+; CHECK-LABEL: combine_mask_with_or:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    movw $2570, %ax # imm = 0xA0A
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpord %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpopcntd %zmm0, %zmm0
+; CHECK-NEXT:    movb $-52, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %shuf0_0 = shufflevector <16 x i32> %v0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %shuf0_1 = shufflevector <16 x i32> %v0, <16 x i32> %shuf0_0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 15>
+  %op0_0 = or <16 x i32> %shuf0_0, %shuf0_1
+  %op1_0 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %v0)
+  %r = shufflevector <16 x i32> %op0_0, <16 x i32> %op1_0, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @combine_mask_with_mul(<16 x i32> %v0) {
+; CHECK-LABEL: combine_mask_with_mul:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    movw $2570, %ax # imm = 0xA0A
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpmulld %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpopcntd %zmm0, %zmm0
+; CHECK-NEXT:    movb $-52, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %shuf0_0 = shufflevector <16 x i32> %v0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %shuf0_1 = shufflevector <16 x i32> %v0, <16 x i32> %shuf0_0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 15>
+  %op0_0 = mul <16 x i32> %shuf0_0, %shuf0_1
+  %op1_0 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %v0)
+  %r = shufflevector <16 x i32> %op0_0, <16 x i32> %op1_0, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @combine_mask_with_abs(<16 x i32> %v0) {
+; CHECK-LABEL: combine_mask_with_abs:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vpabsd %zmm1, %zmm1
+; CHECK-NEXT:    vpopcntd %zmm0, %zmm0
+; CHECK-NEXT:    movb $-52, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %shuf0_0 = shufflevector <16 x i32> %v0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %shuf0_1 = shufflevector <16 x i32> %v0, <16 x i32> %shuf0_0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  %op0_0_tmp0 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %shuf0_0, i1 true)
+  %op0_0_tmp1 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %shuf0_1, i1 true)
+  %op0_0 = shufflevector <16 x i32> %op0_0_tmp0, <16 x i32> %op0_0_tmp0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  %op1_0 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %v0)
+  %r = shufflevector <16 x i32> %op0_0, <16 x i32> %op1_0, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @combine_mask_with_umin(<16 x i32> %v0) {
+; CHECK-LABEL: combine_mask_with_umin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpminud %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpopcntd %zmm0, %zmm0
+; CHECK-NEXT:    movb $-52, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %shuf0_0 = shufflevector <16 x i32> %v0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %shuf0_1 = shufflevector <16 x i32> %v0, <16 x i32> %shuf0_0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  %op0_0 = tail call <16 x i32> @llvm.umin.v16i32(<16 x i32> %shuf0_0, <16 x i32> %shuf0_1)
+  %op1_0 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %v0)
+  %r = shufflevector <16 x i32> %op0_0, <16 x i32> %op1_0, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @combine_mask_with_umax(<16 x i32> %v0) {
+; CHECK-LABEL: combine_mask_with_umax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpmaxud %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpopcntd %zmm0, %zmm0
+; CHECK-NEXT:    movb $-52, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %shuf0_0 = shufflevector <16 x i32> %v0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %shuf0_1 = shufflevector <16 x i32> %v0, <16 x i32> %shuf0_0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  %op0_0 = tail call <16 x i32> @llvm.umax.v16i32(<16 x i32> %shuf0_0, <16 x i32> %shuf0_1)
+  %op1_0 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %v0)
+  %r = shufflevector <16 x i32> %op0_0, <16 x i32> %op1_0, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @combine_mask_with_smin(<16 x i32> %v0) {
+; CHECK-LABEL: combine_mask_with_smin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpminsd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpopcntd %zmm0, %zmm0
+; CHECK-NEXT:    movb $-52, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %shuf0_0 = shufflevector <16 x i32> %v0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %shuf0_1 = shufflevector <16 x i32> %v0, <16 x i32> %shuf0_0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  %op0_0 = tail call <16 x i32> @llvm.smin.v16i32(<16 x i32> %shuf0_0, <16 x i32> %shuf0_1)
+  %op1_0 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %v0)
+  %r = shufflevector <16 x i32> %op0_0, <16 x i32> %op1_0, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @combine_mask_with_smax(<16 x i32> %v0) {
+; CHECK-LABEL: combine_mask_with_smax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpopcntd %zmm0, %zmm0
+; CHECK-NEXT:    movb $-52, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %shuf0_0 = shufflevector <16 x i32> %v0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %shuf0_1 = shufflevector <16 x i32> %v0, <16 x i32> %shuf0_0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  %op0_0 = tail call <16 x i32> @llvm.smax.v16i32(<16 x i32> %shuf0_0, <16 x i32> %shuf0_1)
+  %op1_0 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %v0)
+  %r = shufflevector <16 x i32> %op0_0, <16 x i32> %op1_0, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @combine_mask_with_shl(<16 x i32> %v0) {
+; CHECK-LABEL: combine_mask_with_shl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    movw $2570, %ax # imm = 0xA0A
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpsllvd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpopcntd %zmm0, %zmm0
+; CHECK-NEXT:    movb $-52, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %shuf0_0 = shufflevector <16 x i32> %v0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %shuf0_1 = shufflevector <16 x i32> %v0, <16 x i32> %shuf0_0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 15>
+  %op0_0 = shl <16 x i32> %shuf0_0, %shuf0_1
+  %op1_0 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %v0)
+  %r = shufflevector <16 x i32> %op0_0, <16 x i32> %op1_0, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @combine_mask_with_ashr(<16 x i32> %v0) {
+; CHECK-LABEL: combine_mask_with_ashr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    movw $2570, %ax # imm = 0xA0A
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpsravd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpopcntd %zmm0, %zmm0
+; CHECK-NEXT:    movb $-52, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %shuf0_0 = shufflevector <16 x i32> %v0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %shuf0_1 = shufflevector <16 x i32> %v0, <16 x i32> %shuf0_0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 15>
+  %op0_0 = ashr <16 x i32> %shuf0_0, %shuf0_1
+  %op1_0 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %v0)
+  %r = shufflevector <16 x i32> %op0_0, <16 x i32> %op1_0, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @combine_mask_with_lshr(<16 x i32> %v0) {
+; CHECK-LABEL: combine_mask_with_lshr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    movw $2570, %ax # imm = 0xA0A
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpopcntd %zmm0, %zmm0
+; CHECK-NEXT:    movb $-52, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %shuf0_0 = shufflevector <16 x i32> %v0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %shuf0_1 = shufflevector <16 x i32> %v0, <16 x i32> %shuf0_0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 15>
+  %op0_0 = lshr <16 x i32> %shuf0_0, %shuf0_1
+  %op1_0 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %v0)
+  %r = shufflevector <16 x i32> %op0_0, <16 x i32> %op1_0, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i32> %r
+}
+
+declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>)
+declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)