From e4e0fcd6cd7d99cc9149917cc8c286f2dc36bf76 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 29 Sep 2025 10:01:25 +0100 Subject: [PATCH] [X86] createVPDPBUSD - only use 512-bit X86ISD::VPDPBUSD on AVX512VNNI targets Inspired by #160928 - if we have a AVX512 target capable of AVXVNNI but not AVX512VNNI then we must split 512-bit (or larger) types to 256-bits --- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +- llvm/test/CodeGen/X86/dpbusd.ll | 202 ++++++++++++++---------- llvm/test/CodeGen/X86/dpbusd_const.ll | 192 ++++++++++++++-------- 3 files changed, 244 insertions(+), 156 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fcfeb661aa891..e7eb67ab12edc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4456,8 +4456,8 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, bool AllowAVX512 = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; - if ((CheckBWI && Subtarget.useBWIRegs()) || - (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) { + if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) || + (!CheckBWI && Subtarget.useAVX512Regs()))) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); @@ -46197,7 +46197,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, SDValue Zero = DAG.getConstant(0, DL, DpVT); return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1}, - DpBuilder, false); + DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI()); } // Create a PSADBW given two sources representable as zexts of vXi8. diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index 3aa77c3955c63..7bd22d57347b3 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -1,40 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=AVXVNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=AVX512,AVX512VNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VLVNNI define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: no_dpbusd: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq -; -; AVX512-LABEL: no_dpbusd: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; CHECK-LABEL: no_dpbusd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; CHECK-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 %1 = zext <16 x i8> %0 to <16 x i32> @@ -99,25 +84,44 @@ entry: } define i32 @mul_zext(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: mul_zext: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovsxbw (%rsi), %ymm1 -; AVXVNNI-NEXT: vpmullw %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_zext: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVXVNNI-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVXVNNI-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_zext: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX512-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX512-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_zext: ; AVX512: # %bb.0: # %entry @@ -153,25 +157,44 @@ entry: } define i32 @mul_sext(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: mul_sext: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovsxbw (%rsi), %ymm1 -; AVXVNNI-NEXT: vpmullw %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpmovsxwd %xmm1, %ymm1 -; AVXVNNI-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_sext: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVXVNNI-AVX-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVXVNNI-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_sext: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX512-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX512-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_sext: ; AVX512: # %bb.0: # %entry @@ -312,17 +335,30 @@ entry: declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: vpdpbusd_128: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 -; AVXVNNI-NEXT: vmovd %xmm2, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: vpdpbusd_128: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXVNNI-AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 +; AVXVNNI-AVX-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: vpdpbusd_128: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 +; AVXVNNI-AVX512-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: vpdpbusd_128: ; AVX512VNNI: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll index 456e6e8f263aa..bb47df59eefad 100644 --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -1,20 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=ALL,AVXVNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VLVNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VLVNNI define i32 @mul_4xi8_zc_exceed(<4 x i8> %a, i32 %c) { -; ALL-LABEL: mul_4xi8_zc_exceed: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0] -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: addl %edi, %eax -; ALL-NEXT: retq +; CHECK-LABEL: mul_4xi8_zc_exceed: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq entry: %0 = zext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> %0, @@ -24,14 +25,24 @@ entry: } define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi8_zc: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi8_zc: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi8_zc: ; AVX512VNNI: # %bb.0: # %entry @@ -62,16 +73,26 @@ entry: } define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi4_cz: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi4_cz: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVXVNNI-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi4_cz: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovdb %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi4_cz: ; AVX512VNNI: # %bb.0: # %entry @@ -104,15 +125,26 @@ entry: } define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_cs: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi8_cs: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi8_cs: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX512-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %xmm0, %xmm1, %xmm2 +; AVXVNNI-AVX512-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi8_cs: ; AVX512VNNI: # %bb.0: # %entry @@ -145,17 +177,17 @@ entry: } define i32 @mul_4xi8_cs_exceed(<4 x i8> %a, i32 %c) { -; ALL-LABEL: mul_4xi8_cs_exceed: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vpmovsxbd %xmm0, %xmm0 -; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0] -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: addl %edi, %eax -; ALL-NEXT: retq +; CHECK-LABEL: mul_4xi8_cs_exceed: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 +; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq entry: %0 = sext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> , %0 @@ -265,24 +297,44 @@ entry: } define i32 @mul_64xi8_zc(<64 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_64xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] -; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVXVNNI-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 -; AVXVNNI-NEXT: vpaddd %ymm4, %ymm3, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_64xi8_zc: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] +; AVXVNNI-AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVXVNNI-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 +; AVXVNNI-AVX-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_64xi8_zc: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] +; AVXVNNI-AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVXVNNI-AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 +; AVXVNNI-AVX512-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_64xi8_zc: ; AVX512: # %bb.0: # %entry