diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll index 03b61d9235254..4d341f1b31027 100644 --- a/llvm/test/CodeGen/X86/shift-i512.ll +++ b/llvm/test/CodeGen/X86/shift-i512.ll @@ -1,208 +1,2050 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s -check-prefixes=AVX512VL -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+avx512vbmi2 | FileCheck %s -check-prefixes=AVX512VBMI -; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s -check-prefixes=ZNVER4 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s -check-prefixes=CHECK,SSE +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s -check-prefixes=CHECK,SSE +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s -check-prefixes=CHECK,AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 -mattr=+avx512vbmi2 | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512VBMI -; i512 shifts hidden inside 512-bit vectors. +define i512 @shl_i512(i512 %a0, i512 %a1) nounwind { +; SSE-LABEL: shl_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: pushq %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: andl $56, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: cltq +; SSE-NEXT: movq -56(%rsp,%rax), %rdx +; SSE-NEXT: movq -48(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %rsi +; SSE-NEXT: shldq %cl, %rdx, %rsi +; SSE-NEXT: movq -40(%rsp,%rax), %r10 +; SSE-NEXT: movq %r10, %r8 +; SSE-NEXT: shldq %cl, %r9, %r8 +; SSE-NEXT: movq -32(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %r11 +; SSE-NEXT: shldq %cl, %r10, %r11 +; SSE-NEXT: movq -24(%rsp,%rax), %r10 +; SSE-NEXT: movq %r10, %rbx +; SSE-NEXT: shldq %cl, %r9, %rbx +; SSE-NEXT: movq -16(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %r14 +; SSE-NEXT: shldq %cl, %r10, %r14 +; SSE-NEXT: movq -8(%rsp,%rax), %r10 +; SSE-NEXT: shldq %cl, %r9, %r10 +; SSE-NEXT: movq -64(%rsp,%rax), %rax +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movq %r10, 56(%rdi) +; SSE-NEXT: movq %r14, 48(%rdi) +; SSE-NEXT: movq %rbx, 40(%rdi) +; SSE-NEXT: movq %r11, 32(%rdi) +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rsi, 16(%rdi) +; SSE-NEXT: movq %rdx, 8(%rdi) +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: shl_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: movq 
{{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: andl $56, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: movslq %eax, %r8 +; AVX2-NEXT: movq -56(%rsp,%r8), %rdx +; AVX2-NEXT: movq -48(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shldq %cl, %rdx, %rsi +; AVX2-NEXT: movq -40(%rsp,%r8), %r10 +; AVX2-NEXT: movq %r10, %r9 +; AVX2-NEXT: shldq %cl, %rax, %r9 +; AVX2-NEXT: movq -32(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %r11 +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: movq -24(%rsp,%r8), %r10 +; AVX2-NEXT: movq %r10, %rbx +; AVX2-NEXT: shldq %cl, %rax, %rbx +; AVX2-NEXT: movq -16(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %r14 +; AVX2-NEXT: shldq %cl, %r10, %r14 +; AVX2-NEXT: movq -8(%rsp,%r8), %r10 +; AVX2-NEXT: shldq %cl, %rax, %r10 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq -64(%rsp,%r8), %rdi +; AVX2-NEXT: shlxq %rcx, %rdi, %r8 +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %rdi, %rdx +; AVX2-NEXT: movq %r10, 56(%rax) +; AVX2-NEXT: movq %r14, 48(%rax) +; AVX2-NEXT: movq %rbx, 40(%rax) +; AVX2-NEXT: movq %r11, 32(%rax) +; AVX2-NEXT: movq %r9, 24(%rax) +; AVX2-NEXT: movq %rsi, 16(%rax) +; AVX2-NEXT: movq %rdx, 8(%rax) +; AVX2-NEXT: movq %r8, (%rax) +; AVX2-NEXT: addq $8, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: shl_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: pushq %rax +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: shrl $3, %eax +; AVX512F-NEXT: andl $56, %eax +; AVX512F-NEXT: negl %eax +; AVX512F-NEXT: movslq %eax, %r8 +; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx +; AVX512F-NEXT: movq -48(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: shldq %cl, %rdx, %rsi +; AVX512F-NEXT: movq -40(%rsp,%r8), %r10 +; AVX512F-NEXT: movq %r10, %r9 +; AVX512F-NEXT: shldq %cl, %rax, %r9 +; AVX512F-NEXT: movq -32(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: shldq %cl, %r10, %r11 +; AVX512F-NEXT: movq -24(%rsp,%r8), %r10 +; AVX512F-NEXT: movq %r10, %rbx +; AVX512F-NEXT: shldq %cl, %rax, %rbx +; AVX512F-NEXT: movq -16(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: shldq %cl, %r10, %r14 +; AVX512F-NEXT: movq -8(%rsp,%r8), %r10 +; AVX512F-NEXT: shldq %cl, %rax, %r10 +; AVX512F-NEXT: movq %rdi, %rax +; 
AVX512F-NEXT: movq -64(%rsp,%r8), %rdi +; AVX512F-NEXT: shlxq %rcx, %rdi, %r8 +; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512F-NEXT: shldq %cl, %rdi, %rdx +; AVX512F-NEXT: movq %r10, 56(%rax) +; AVX512F-NEXT: movq %r14, 48(%rax) +; AVX512F-NEXT: movq %rbx, 40(%rax) +; AVX512F-NEXT: movq %r11, 32(%rax) +; AVX512F-NEXT: movq %r9, 24(%rax) +; AVX512F-NEXT: movq %rsi, 16(%rax) +; AVX512F-NEXT: movq %rdx, 8(%rax) +; AVX512F-NEXT: movq %r8, (%rax) +; AVX512F-NEXT: addq $8, %rsp +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shl_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %r15 +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: andl $63, %ecx +; AVX512VL-NEXT: shrl $3, %eax +; AVX512VL-NEXT: andl $56, %eax +; AVX512VL-NEXT: negl %eax +; AVX512VL-NEXT: movslq %eax, %r9 +; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx +; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax +; AVX512VL-NEXT: movq %rax, %rsi +; AVX512VL-NEXT: shldq %cl, %rdx, %rsi +; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10 +; AVX512VL-NEXT: movq %r10, %r8 +; AVX512VL-NEXT: shldq %cl, %rax, %r8 +; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11 +; AVX512VL-NEXT: movq %r11, %rbx +; AVX512VL-NEXT: shldq %cl, %r10, %rbx +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi +; AVX512VL-NEXT: movq %rdi, %r10 +; AVX512VL-NEXT: shldq %cl, %r11, %r10 +; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11 +; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14 +; AVX512VL-NEXT: movq %r14, %r15 +; AVX512VL-NEXT: shldq %cl, %rdi, %r15 +; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi +; AVX512VL-NEXT: shldq %cl, %r14, %rdi +; AVX512VL-NEXT: shlxq %rcx, %r11, %r9 +; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512VL-NEXT: shldq %cl, %r11, %rdx +; AVX512VL-NEXT: movq %rdi, 56(%rax) +; AVX512VL-NEXT: movq %r15, 48(%rax) +; AVX512VL-NEXT: movq %r10, 40(%rax) +; AVX512VL-NEXT: movq %rbx, 32(%rax) +; AVX512VL-NEXT: movq %r8, 24(%rax) +; AVX512VL-NEXT: movq %rsi, 16(%rax) +; AVX512VL-NEXT: movq %rdx, 8(%rax) +; AVX512VL-NEXT: movq %r9, (%rax) +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: popq %r15 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512VBMI-LABEL: shl_i512: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: pushq %r15 +; AVX512VBMI-NEXT: pushq %r14 +; AVX512VBMI-NEXT: pushq %rbx +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups 
%ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movl %eax, %ecx +; AVX512VBMI-NEXT: andl $63, %ecx +; AVX512VBMI-NEXT: shrl $3, %eax +; AVX512VBMI-NEXT: andl $56, %eax +; AVX512VBMI-NEXT: negl %eax +; AVX512VBMI-NEXT: movslq %eax, %r9 +; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx +; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax +; AVX512VBMI-NEXT: movq %rax, %rsi +; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi +; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10 +; AVX512VBMI-NEXT: movq %r10, %r8 +; AVX512VBMI-NEXT: shldq %cl, %rax, %r8 +; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11 +; AVX512VBMI-NEXT: movq %r11, %rbx +; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi +; AVX512VBMI-NEXT: movq %rdi, %r10 +; AVX512VBMI-NEXT: shldq %cl, %r11, %r10 +; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11 +; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14 +; AVX512VBMI-NEXT: movq %r14, %r15 +; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15 +; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi +; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi +; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9 +; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx +; AVX512VBMI-NEXT: movq %rdi, 56(%rax) +; AVX512VBMI-NEXT: movq %r15, 48(%rax) +; AVX512VBMI-NEXT: movq %r10, 40(%rax) +; AVX512VBMI-NEXT: movq %rbx, 32(%rax) +; AVX512VBMI-NEXT: movq %r8, 24(%rax) +; AVX512VBMI-NEXT: movq %rsi, 16(%rax) +; AVX512VBMI-NEXT: movq %rdx, 8(%rax) +; AVX512VBMI-NEXT: movq %r9, (%rax) +; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: popq %r14 +; AVX512VBMI-NEXT: popq %r15 +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq + %r = shl i512 %a0, %a1 + ret i512 %r +} -define <8 x i64> @shl_i512_1(<8 x i64> %a) { -; AVX512VL-LABEL: shl_i512_1: +define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind { +; SSE-LABEL: lshr_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: andl $56, %eax +; SSE-NEXT: movq -112(%rsp,%rax), %rdx +; SSE-NEXT: movq -120(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %rsi +; SSE-NEXT: shrdq %cl, %rdx, %rsi +; SSE-NEXT: movq -104(%rsp,%rax), %r8 +; SSE-NEXT: shrdq %cl, %r8, %rdx +; SSE-NEXT: movq -96(%rsp,%rax), %r10 +; SSE-NEXT: shrdq %cl, %r10, %r8 +; SSE-NEXT: movq -88(%rsp,%rax), %r11 +; SSE-NEXT: shrdq %cl, %r11, %r10 +; SSE-NEXT: movq -80(%rsp,%rax), %rbx +; SSE-NEXT: shrdq %cl, %rbx, %r11 +; SSE-NEXT: movq -72(%rsp,%rax), %r14 +; SSE-NEXT: shrdq %cl, %r14, %rbx +; SSE-NEXT: movq -128(%rsp,%rax), %r15 +; SSE-NEXT: shrdq %cl, %r9, %r15 +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shrq %cl, %r14 +; SSE-NEXT: movq %r14, 56(%rdi) +; SSE-NEXT: movq %rbx, 48(%rdi) 
+; SSE-NEXT: movq %r11, 40(%rdi) +; SSE-NEXT: movq %r10, 32(%rdi) +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rdx, 16(%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: movq %r15, (%rdi) +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: lshr_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: andl $56, %eax +; AVX2-NEXT: movq -112(%rsp,%rax), %rdx +; AVX2-NEXT: movq -120(%rsp,%rax), %r9 +; AVX2-NEXT: movq %r9, %rsi +; AVX2-NEXT: shrdq %cl, %rdx, %rsi +; AVX2-NEXT: movq -104(%rsp,%rax), %r8 +; AVX2-NEXT: shrdq %cl, %r8, %rdx +; AVX2-NEXT: movq -96(%rsp,%rax), %r10 +; AVX2-NEXT: shrdq %cl, %r10, %r8 +; AVX2-NEXT: movq -88(%rsp,%rax), %r11 +; AVX2-NEXT: shrdq %cl, %r11, %r10 +; AVX2-NEXT: movq -80(%rsp,%rax), %rbx +; AVX2-NEXT: shrdq %cl, %rbx, %r11 +; AVX2-NEXT: movq -128(%rsp,%rax), %r14 +; AVX2-NEXT: movq -72(%rsp,%rax), %r15 +; AVX2-NEXT: shrdq %cl, %r15, %rbx +; AVX2-NEXT: shrdq %cl, %r9, %r14 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrxq %rcx, %r15, %rcx +; AVX2-NEXT: movq %rcx, 56(%rdi) +; AVX2-NEXT: movq %rbx, 48(%rdi) +; AVX2-NEXT: movq %r11, 40(%rdi) +; AVX2-NEXT: movq %r10, 32(%rdi) +; AVX2-NEXT: movq %r8, 24(%rdi) +; AVX2-NEXT: movq %rdx, 16(%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: movq %r14, (%rdi) +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: lshr_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r15 +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: shrl $3, %eax +; AVX512F-NEXT: andl $56, %eax +; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512F-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512F-NEXT: movq %r9, %rsi +; AVX512F-NEXT: shrdq %cl, %rdx, %rsi +; AVX512F-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512F-NEXT: shrdq %cl, %r8, %rdx +; AVX512F-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512F-NEXT: shrdq %cl, %r10, %r8 +; AVX512F-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512F-NEXT: shrdq %cl, %r11, %r10 +; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512F-NEXT: shrdq %cl, %rbx, %r11 +; AVX512F-NEXT: movq -128(%rsp,%rax), %r14 +; AVX512F-NEXT: movq -72(%rsp,%rax), %r15 +; AVX512F-NEXT: shrdq 
%cl, %r15, %rbx +; AVX512F-NEXT: shrdq %cl, %r9, %r14 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: shrxq %rcx, %r15, %rcx +; AVX512F-NEXT: movq %rcx, 56(%rdi) +; AVX512F-NEXT: movq %rbx, 48(%rdi) +; AVX512F-NEXT: movq %r11, 40(%rdi) +; AVX512F-NEXT: movq %r10, 32(%rdi) +; AVX512F-NEXT: movq %r8, 24(%rdi) +; AVX512F-NEXT: movq %rdx, 16(%rdi) +; AVX512F-NEXT: movq %rsi, 8(%rdi) +; AVX512F-NEXT: movq %r14, (%rdi) +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: popq %r15 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: lshr_i512: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm0[3,4,5,6,7,0,1,2] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm3 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX512VL-NEXT: vpsrlq $63, %xmm4, %xmm4 -; AVX512VL-NEXT: vpaddq %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpaddq %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlq $63, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512VL-NEXT: vpsrlq $63, %zmm0, %zmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] +; AVX512VL-NEXT: pushq %r15 +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: andl $63, %ecx +; AVX512VL-NEXT: shrl $3, %eax +; AVX512VL-NEXT: andl $56, %eax +; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512VL-NEXT: movq %r9, %rsi +; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi +; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512VL-NEXT: shrdq %cl, %r8, %rdx +; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512VL-NEXT: shrdq %cl, %r10, %r8 +; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512VL-NEXT: shrdq %cl, %r11, %r10 +; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512VL-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14 +; AVX512VL-NEXT: shrdq %cl, %r14, %rbx +; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15 +; AVX512VL-NEXT: shrdq %cl, %r9, %r15 +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx +; AVX512VL-NEXT: movq %rcx, 56(%rdi) +; AVX512VL-NEXT: movq %rbx, 48(%rdi) +; AVX512VL-NEXT: movq %r11, 40(%rdi) +; AVX512VL-NEXT: movq %r10, 32(%rdi) +; AVX512VL-NEXT: movq %r8, 24(%rdi) +; AVX512VL-NEXT: movq %rdx, 16(%rdi) +; AVX512VL-NEXT: movq %rsi, 8(%rdi) +; AVX512VL-NEXT: movq %r15, (%rdi) +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: popq %r15 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; 
-; AVX512VBMI-LABEL: shl_i512_1: +; AVX512VBMI-LABEL: lshr_i512: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX512VBMI-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 -; AVX512VBMI-NEXT: vpaddq %xmm0, %xmm0, %xmm4 -; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VBMI-NEXT: vpshldq $1, %ymm1, %ymm2, %ymm1 -; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; AVX512VBMI-NEXT: vpshldq $1, %zmm0, %zmm2, %zmm0 -; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] +; AVX512VBMI-NEXT: pushq %r15 +; AVX512VBMI-NEXT: pushq %r14 +; AVX512VBMI-NEXT: pushq %rbx +; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movl %eax, %ecx +; AVX512VBMI-NEXT: andl $63, %ecx +; AVX512VBMI-NEXT: shrl $3, %eax +; AVX512VBMI-NEXT: andl $56, %eax +; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512VBMI-NEXT: movq %r9, %rsi +; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi +; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx +; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8 +; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10 +; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14 +; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx +; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15 +; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15 +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx +; AVX512VBMI-NEXT: movq %rcx, 56(%rdi) +; AVX512VBMI-NEXT: movq %rbx, 48(%rdi) +; AVX512VBMI-NEXT: movq %r11, 40(%rdi) +; AVX512VBMI-NEXT: movq %r10, 32(%rdi) +; AVX512VBMI-NEXT: movq %r8, 24(%rdi) +; AVX512VBMI-NEXT: movq %rdx, 16(%rdi) +; AVX512VBMI-NEXT: movq %rsi, 8(%rdi) +; AVX512VBMI-NEXT: movq %r15, (%rdi) +; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: popq %r14 +; AVX512VBMI-NEXT: popq %r15 +; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq + %r = lshr i512 %a0, %a1 + ret i512 %r +} + +define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind { +; SSE-LABEL: ashr_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r8, 
-{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; SSE-NEXT: sarq $63, %r10 +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: andl $56, %eax +; SSE-NEXT: movq -112(%rsp,%rax), %rdx +; SSE-NEXT: movq -120(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %rsi +; SSE-NEXT: shrdq %cl, %rdx, %rsi +; SSE-NEXT: movq -104(%rsp,%rax), %r8 +; SSE-NEXT: shrdq %cl, %r8, %rdx +; SSE-NEXT: movq -96(%rsp,%rax), %r10 +; SSE-NEXT: shrdq %cl, %r10, %r8 +; SSE-NEXT: movq -88(%rsp,%rax), %r11 +; SSE-NEXT: shrdq %cl, %r11, %r10 +; SSE-NEXT: movq -80(%rsp,%rax), %rbx +; SSE-NEXT: shrdq %cl, %rbx, %r11 +; SSE-NEXT: movq -72(%rsp,%rax), %r14 +; SSE-NEXT: shrdq %cl, %r14, %rbx +; SSE-NEXT: movq -128(%rsp,%rax), %r15 +; SSE-NEXT: shrdq %cl, %r9, %r15 +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: sarq %cl, %r14 +; SSE-NEXT: movq %r14, 56(%rdi) +; SSE-NEXT: movq %rbx, 48(%rdi) +; SSE-NEXT: movq %r11, 40(%rdi) +; SSE-NEXT: movq %r10, 32(%rdi) +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rdx, 16(%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: movq %r15, (%rdi) +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: ashr_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: andl $56, %eax +; AVX2-NEXT: movq -112(%rsp,%rax), %rdx +; AVX2-NEXT: movq -120(%rsp,%rax), %r9 +; AVX2-NEXT: movq %r9, %rsi +; AVX2-NEXT: shrdq %cl, %rdx, %rsi +; AVX2-NEXT: movq -104(%rsp,%rax), %r8 +; AVX2-NEXT: shrdq %cl, %r8, %rdx +; AVX2-NEXT: movq -96(%rsp,%rax), %r10 +; AVX2-NEXT: shrdq %cl, %r10, %r8 +; AVX2-NEXT: movq -88(%rsp,%rax), %r11 +; AVX2-NEXT: shrdq %cl, %r11, %r10 +; AVX2-NEXT: movq -80(%rsp,%rax), %rbx +; AVX2-NEXT: shrdq %cl, %rbx, %r11 +; AVX2-NEXT: movq -128(%rsp,%rax), %r14 +; AVX2-NEXT: movq -72(%rsp,%rax), %r15 +; AVX2-NEXT: shrdq %cl, %r15, %rbx +; AVX2-NEXT: shrdq %cl, %r9, %r14 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: sarxq %rcx, %r15, %rcx +; AVX2-NEXT: movq %rcx, 56(%rdi) +; AVX2-NEXT: movq %rbx, 48(%rdi) +; AVX2-NEXT: movq %r11, 40(%rdi) +; AVX2-NEXT: movq %r10, 32(%rdi) +; AVX2-NEXT: movq %r8, 24(%rdi) +; AVX2-NEXT: 
movq %rdx, 16(%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: movq %r14, (%rdi) +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: ashr_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r15 +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: sarq $63, %r10 +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: shrl $3, %eax +; AVX512F-NEXT: andl $56, %eax +; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512F-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512F-NEXT: movq %r9, %rsi +; AVX512F-NEXT: shrdq %cl, %rdx, %rsi +; AVX512F-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512F-NEXT: shrdq %cl, %r8, %rdx +; AVX512F-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512F-NEXT: shrdq %cl, %r10, %r8 +; AVX512F-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512F-NEXT: shrdq %cl, %r11, %r10 +; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512F-NEXT: shrdq %cl, %rbx, %r11 +; AVX512F-NEXT: movq -128(%rsp,%rax), %r14 +; AVX512F-NEXT: movq -72(%rsp,%rax), %r15 +; AVX512F-NEXT: shrdq %cl, %r15, %rbx +; AVX512F-NEXT: shrdq %cl, %r9, %r14 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: sarxq %rcx, %r15, %rcx +; AVX512F-NEXT: movq %rcx, 56(%rdi) +; AVX512F-NEXT: movq %rbx, 48(%rdi) +; AVX512F-NEXT: movq %r11, 40(%rdi) +; AVX512F-NEXT: movq %r10, 32(%rdi) +; AVX512F-NEXT: movq %r8, 24(%rdi) +; AVX512F-NEXT: movq %rdx, 16(%rdi) +; AVX512F-NEXT: movq %rsi, 8(%rdi) +; AVX512F-NEXT: movq %r14, (%rdi) +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: popq %r15 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: ashr_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %r15 +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: sarq $63, %r10 +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: andl $63, %ecx +; AVX512VL-NEXT: shrl $3, %eax +; 
AVX512VL-NEXT: andl $56, %eax +; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512VL-NEXT: movq %r9, %rsi +; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi +; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512VL-NEXT: shrdq %cl, %r8, %rdx +; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512VL-NEXT: shrdq %cl, %r10, %r8 +; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512VL-NEXT: shrdq %cl, %r11, %r10 +; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512VL-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14 +; AVX512VL-NEXT: shrdq %cl, %r14, %rbx +; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15 +; AVX512VL-NEXT: shrdq %cl, %r9, %r15 +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx +; AVX512VL-NEXT: movq %rcx, 56(%rdi) +; AVX512VL-NEXT: movq %rbx, 48(%rdi) +; AVX512VL-NEXT: movq %r11, 40(%rdi) +; AVX512VL-NEXT: movq %r10, 32(%rdi) +; AVX512VL-NEXT: movq %r8, 24(%rdi) +; AVX512VL-NEXT: movq %rdx, 16(%rdi) +; AVX512VL-NEXT: movq %rsi, 8(%rdi) +; AVX512VL-NEXT: movq %r15, (%rdi) +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: popq %r15 +; AVX512VL-NEXT: retq ; -; ZNVER4-LABEL: shl_i512_1: -; ZNVER4: # %bb.0: -; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm2 -; ZNVER4-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; ZNVER4-NEXT: vpaddq %xmm0, %xmm0, %xmm4 -; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 -; ZNVER4-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; ZNVER4-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; ZNVER4-NEXT: vpshldq $1, %ymm1, %ymm2, %ymm1 -; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; ZNVER4-NEXT: vpshldq $1, %zmm0, %zmm3, %zmm0 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] -; ZNVER4-NEXT: retq - %d = bitcast <8 x i64> %a to i512 - %s = shl i512 %d, 1 - %r = bitcast i512 %s to <8 x i64> - ret <8 x i64> %r +; AVX512VBMI-LABEL: ashr_i512: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: pushq %r15 +; AVX512VBMI-NEXT: pushq %r14 +; AVX512VBMI-NEXT: pushq %rbx +; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: sarq $63, %r10 +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movl %eax, %ecx +; AVX512VBMI-NEXT: andl $63, %ecx +; AVX512VBMI-NEXT: shrl $3, %eax +; AVX512VBMI-NEXT: andl $56, %eax +; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512VBMI-NEXT: movq %r9, %rsi +; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi +; AVX512VBMI-NEXT: 
movq -104(%rsp,%rax), %r8 +; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx +; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8 +; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10 +; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14 +; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx +; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15 +; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15 +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx +; AVX512VBMI-NEXT: movq %rcx, 56(%rdi) +; AVX512VBMI-NEXT: movq %rbx, 48(%rdi) +; AVX512VBMI-NEXT: movq %r11, 40(%rdi) +; AVX512VBMI-NEXT: movq %r10, 32(%rdi) +; AVX512VBMI-NEXT: movq %r8, 24(%rdi) +; AVX512VBMI-NEXT: movq %rdx, 16(%rdi) +; AVX512VBMI-NEXT: movq %rsi, 8(%rdi) +; AVX512VBMI-NEXT: movq %r15, (%rdi) +; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: popq %r14 +; AVX512VBMI-NEXT: popq %r15 +; AVX512VBMI-NEXT: retq + %r = ashr i512 %a0, %a1 + ret i512 %r +} + +define i512 @shl_i512_1(i512 %a0) nounwind { +; CHECK-LABEL: shl_i512_1: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; CHECK-NEXT: shldq $1, %rdi, %r10 +; CHECK-NEXT: shldq $1, %r11, %rdi +; CHECK-NEXT: shldq $1, %r9, %r11 +; CHECK-NEXT: shldq $1, %r8, %r9 +; CHECK-NEXT: shldq $1, %rcx, %r8 +; CHECK-NEXT: shldq $1, %rdx, %rcx +; CHECK-NEXT: shldq $1, %rsi, %rdx +; CHECK-NEXT: addq %rsi, %rsi +; CHECK-NEXT: movq %r10, 56(%rax) +; CHECK-NEXT: movq %rdi, 48(%rax) +; CHECK-NEXT: movq %r11, 40(%rax) +; CHECK-NEXT: movq %r9, 32(%rax) +; CHECK-NEXT: movq %r8, 24(%rax) +; CHECK-NEXT: movq %rcx, 16(%rax) +; CHECK-NEXT: movq %rdx, 8(%rax) +; CHECK-NEXT: movq %rsi, (%rax) +; CHECK-NEXT: retq + %r = shl i512 %a0, 1 + ret i512 %r +} + +define i512 @lshr_i512_1(i512 %a0) nounwind { +; CHECK-LABEL: lshr_i512_1: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; CHECK-NEXT: shrdq $1, %rdx, %rsi +; CHECK-NEXT: shrdq $1, %rcx, %rdx +; CHECK-NEXT: shrdq $1, %r8, %rcx +; CHECK-NEXT: shrdq $1, %r9, %r8 +; CHECK-NEXT: shrdq $1, %r11, %r9 +; CHECK-NEXT: shrdq $1, %rdi, %r11 +; CHECK-NEXT: shrdq $1, %r10, %rdi +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: movq %r10, 56(%rax) +; CHECK-NEXT: movq %rdi, 48(%rax) +; CHECK-NEXT: movq %r11, 40(%rax) +; CHECK-NEXT: movq %r9, 32(%rax) +; CHECK-NEXT: movq %r8, 24(%rax) +; CHECK-NEXT: movq %rcx, 16(%rax) +; CHECK-NEXT: movq %rdx, 8(%rax) +; CHECK-NEXT: movq %rsi, (%rax) +; CHECK-NEXT: retq + %r = lshr i512 %a0, 1 + ret i512 %r +} + +define i512 @ashr_i512_1(i512 %a0) nounwind { +; CHECK-LABEL: ashr_i512_1: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; CHECK-NEXT: shrdq $1, %rdx, %rsi +; CHECK-NEXT: shrdq $1, %rcx, %rdx +; CHECK-NEXT: shrdq $1, %r8, %rcx +; CHECK-NEXT: shrdq $1, %r9, %r8 +; CHECK-NEXT: shrdq $1, %r11, %r9 +; CHECK-NEXT: shrdq $1, %rdi, %r11 +; CHECK-NEXT: shrdq $1, %r10, %rdi +; CHECK-NEXT: sarq %r10 +; CHECK-NEXT: movq %r10, 56(%rax) +; CHECK-NEXT: movq %rdi, 48(%rax) +; CHECK-NEXT: movq %r11, 40(%rax) +; CHECK-NEXT: movq %r9, 32(%rax) +; CHECK-NEXT: movq %r8, 24(%rax) +; CHECK-NEXT: movq %rcx, 16(%rax) +; CHECK-NEXT: movq 
%rdx, 8(%rax) +; CHECK-NEXT: movq %rsi, (%rax) +; CHECK-NEXT: retq + %r = ashr i512 %a0, 1 + ret i512 %r +} + +define i512 @shl_i512_200(i512 %a0) nounwind { +; SSE-LABEL: shl_i512_200: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: shldq $8, %r8, %r9 +; SSE-NEXT: shldq $8, %rcx, %r8 +; SSE-NEXT: shldq $8, %rdx, %rcx +; SSE-NEXT: shldq $8, %rsi, %rdx +; SSE-NEXT: shlq $8, %rsi +; SSE-NEXT: movq %r9, 56(%rdi) +; SSE-NEXT: movq %r8, 48(%rdi) +; SSE-NEXT: movq %rcx, 40(%rdi) +; SSE-NEXT: movq %rdx, 32(%rdi) +; SSE-NEXT: movq %rsi, 24(%rdi) +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: movq $0, 16(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: shl_i512_200: +; AVX2: # %bb.0: +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shldq $8, %r8, %r9 +; AVX2-NEXT: shldq $8, %rcx, %r8 +; AVX2-NEXT: shldq $8, %rdx, %rcx +; AVX2-NEXT: shldq $8, %rsi, %rdx +; AVX2-NEXT: shlq $8, %rsi +; AVX2-NEXT: movq %r9, 56(%rdi) +; AVX2-NEXT: movq %r8, 48(%rdi) +; AVX2-NEXT: movq %rcx, 40(%rdi) +; AVX2-NEXT: movq %rdx, 32(%rdi) +; AVX2-NEXT: movq %rsi, 24(%rdi) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-NEXT: movq $0, 16(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: shl_i512_200: +; AVX512: # %bb.0: +; AVX512-NEXT: movq %rdi, %rax +; AVX512-NEXT: shldq $8, %r8, %r9 +; AVX512-NEXT: shldq $8, %rcx, %r8 +; AVX512-NEXT: shldq $8, %rdx, %rcx +; AVX512-NEXT: shldq $8, %rsi, %rdx +; AVX512-NEXT: shlq $8, %rsi +; AVX512-NEXT: movq %r9, 56(%rdi) +; AVX512-NEXT: movq %r8, 48(%rdi) +; AVX512-NEXT: movq %rcx, 40(%rdi) +; AVX512-NEXT: movq %rdx, 32(%rdi) +; AVX512-NEXT: movq %rsi, 24(%rdi) +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rdi) +; AVX512-NEXT: movq $0, 16(%rdi) +; AVX512-NEXT: retq + %r = shl i512 %a0, 200 + ret i512 %r +} + +define i512 @lshr_i512_200(i512 %a0) nounwind { +; SSE-LABEL: lshr_i512_200: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: shrdq $8, %r9, %r8 +; SSE-NEXT: shrdq $8, %rsi, %r9 +; SSE-NEXT: shrdq $8, %rcx, %rsi +; SSE-NEXT: shrdq $8, %rdx, %rcx +; SSE-NEXT: shrq $8, %rdx +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, 40(%rdi) +; SSE-NEXT: movq %rdx, 32(%rdi) +; SSE-NEXT: movq %rcx, 24(%rdi) +; SSE-NEXT: movq %rsi, 16(%rdi) +; SSE-NEXT: movq %r9, 8(%rdi) +; SSE-NEXT: movq %r8, (%rdi) +; SSE-NEXT: movq $0, 56(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: lshr_i512_200: +; AVX2: # %bb.0: +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX2-NEXT: shrdq $8, %r9, %r8 +; AVX2-NEXT: shrdq $8, %rsi, %r9 +; AVX2-NEXT: shrdq $8, %rcx, %rsi +; AVX2-NEXT: shrdq $8, %rdx, %rcx +; AVX2-NEXT: shrq $8, %rdx +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %xmm0, 40(%rdi) +; AVX2-NEXT: movq %rdx, 32(%rdi) +; AVX2-NEXT: movq %rcx, 24(%rdi) +; AVX2-NEXT: movq %rsi, 16(%rdi) +; AVX2-NEXT: movq %r9, 8(%rdi) +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq $0, 56(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: lshr_i512_200: +; AVX512: # %bb.0: +; AVX512-NEXT: movq %rdi, %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512-NEXT: shrdq $8, %r9, %r8 +; AVX512-NEXT: shrdq $8, %rsi, %r9 +; AVX512-NEXT: shrdq $8, %rcx, %rsi +; AVX512-NEXT: shrdq $8, %rdx, %rcx +; 
AVX512-NEXT: shrq $8, %rdx +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %xmm0, 40(%rdi) +; AVX512-NEXT: movq %rdx, 32(%rdi) +; AVX512-NEXT: movq %rcx, 24(%rdi) +; AVX512-NEXT: movq %rsi, 16(%rdi) +; AVX512-NEXT: movq %r9, 8(%rdi) +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq $0, 56(%rdi) +; AVX512-NEXT: retq + %r = lshr i512 %a0, 200 + ret i512 %r +} + +define i512 @ashr_i512_200(i512 %a0) nounwind { +; CHECK-LABEL: ashr_i512_200: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: shrdq $8, %r9, %r8 +; CHECK-NEXT: shrdq $8, %rsi, %r9 +; CHECK-NEXT: shrdq $8, %rcx, %rsi +; CHECK-NEXT: shrdq $8, %rdx, %rcx +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: sarq $8, %rdi +; CHECK-NEXT: sarq $63, %rdx +; CHECK-NEXT: movq %rdx, 56(%rax) +; CHECK-NEXT: movq %rdx, 48(%rax) +; CHECK-NEXT: movq %rdx, 40(%rax) +; CHECK-NEXT: movq %rdi, 32(%rax) +; CHECK-NEXT: movq %rcx, 24(%rax) +; CHECK-NEXT: movq %rsi, 16(%rax) +; CHECK-NEXT: movq %r9, 8(%rax) +; CHECK-NEXT: movq %r8, (%rax) +; CHECK-NEXT: retq + %r = ashr i512 %a0, 200 + ret i512 %r } -define <8 x i64> @lshr_i512_1(<8 x i64> %a) { -; AVX512VL-LABEL: lshr_i512_1: +define i512 @shl_i512_511(i512 %a0) nounwind { +; SSE-LABEL: shl_i512_511: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: shlq $63, %rsi +; SSE-NEXT: movq %rsi, 56(%rdi) +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, 32(%rdi) +; SSE-NEXT: movaps %xmm0, 16(%rdi) +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: movq $0, 48(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: shl_i512_511: +; AVX2: # %bb.0: +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shlq $63, %rsi +; AVX2-NEXT: movq %rsi, 56(%rdi) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: movq $0, 48(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: shl_i512_511: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: shlq $63, %rsi +; AVX512F-NEXT: movq %rsi, 56(%rdi) +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %ymm0, (%rdi) +; AVX512F-NEXT: movq $0, 48(%rdi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shl_i512_511: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: shlq $63, %rsi +; AVX512VL-NEXT: movq %rsi, 56(%rdi) +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %ymm0, (%rdi) +; AVX512VL-NEXT: movq $0, 48(%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512VBMI-LABEL: shl_i512_511: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: shlq $63, %rsi +; AVX512VBMI-NEXT: movq %rsi, 56(%rdi) +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups %ymm0, (%rdi) +; AVX512VBMI-NEXT: movq $0, 48(%rdi) +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq + %r = shl i512 %a0, 511 + ret i512 %r +} + +define i512 @lshr_i512_511(i512 %a0) nounwind { +; SSE-LABEL: lshr_i512_511: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: shrq $63, %rcx +; SSE-NEXT: xorps %xmm0, 
%xmm0 +; SSE-NEXT: movups %xmm0, 40(%rdi) +; SSE-NEXT: movups %xmm0, 24(%rdi) +; SSE-NEXT: movups %xmm0, 8(%rdi) +; SSE-NEXT: movq %rcx, (%rdi) +; SSE-NEXT: movq $0, 56(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: lshr_i512_511: +; AVX2: # %bb.0: +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: shrq $63, %rcx +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %xmm0, 40(%rdi) +; AVX2-NEXT: movq %rcx, (%rdi) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, 8(%rdi) +; AVX2-NEXT: movq $0, 56(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: lshr_i512_511: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512F-NEXT: shrq $63, %rcx +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %xmm0, 40(%rdi) +; AVX512F-NEXT: movq %rcx, (%rdi) +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %ymm0, 8(%rdi) +; AVX512F-NEXT: movq $0, 56(%rdi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: lshr_i512_511: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: vpsllq $63, %xmm3, %xmm4 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX512VL-NEXT: vpsrlq $1, %xmm5, %xmm5 -; AVX512VL-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX512VL-NEXT: vpsrlq $1, %xmm3, %xmm3 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsllq $63, %ymm1, %ymm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7] -; AVX512VL-NEXT: vpsrlq $1, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %xmm0, 40(%rdi) +; AVX512VL-NEXT: shrq $63, %rcx +; AVX512VL-NEXT: movq %rcx, (%rdi) +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %ymm0, 8(%rdi) +; AVX512VL-NEXT: movq $0, 56(%rdi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VBMI-LABEL: lshr_i512_1: +; AVX512VBMI-LABEL: lshr_i512_511: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX512VBMI-NEXT: vpsrlq $1, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] -; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1 -; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0 -; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = 
zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups %xmm0, 40(%rdi) +; AVX512VBMI-NEXT: shrq $63, %rcx +; AVX512VBMI-NEXT: movq %rcx, (%rdi) +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups %ymm0, 8(%rdi) +; AVX512VBMI-NEXT: movq $0, 56(%rdi) +; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq + %r = lshr i512 %a0, 511 + ret i512 %r +} + +define i512 @ashr_i512_511(i512 %a0) nounwind { +; CHECK-LABEL: ashr_i512_511: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: sarq $63, %rcx +; CHECK-NEXT: movq %rcx, 56(%rdi) +; CHECK-NEXT: movq %rcx, 48(%rdi) +; CHECK-NEXT: movq %rcx, 40(%rdi) +; CHECK-NEXT: movq %rcx, 32(%rdi) +; CHECK-NEXT: movq %rcx, 24(%rdi) +; CHECK-NEXT: movq %rcx, 16(%rdi) +; CHECK-NEXT: movq %rcx, 8(%rdi) +; CHECK-NEXT: movq %rcx, (%rdi) +; CHECK-NEXT: retq + %r = ashr i512 %a0, 511 + ret i512 %r +} + +define i512 @shl_1_i512(i512 %a0) nounwind { +; SSE-LABEL: shl_1_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: pushq %rax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %rax +; SSE-NEXT: movq -56(%rsp,%rax), %rdx +; SSE-NEXT: movq -48(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %rsi +; SSE-NEXT: shldq %cl, %rdx, %rsi +; SSE-NEXT: movq -40(%rsp,%rax), %r10 +; SSE-NEXT: movq %r10, %r8 +; SSE-NEXT: shldq %cl, %r9, %r8 +; SSE-NEXT: movq -32(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %r11 +; SSE-NEXT: shldq %cl, %r10, %r11 +; SSE-NEXT: movq -24(%rsp,%rax), %r10 +; SSE-NEXT: movq %r10, %rbx +; SSE-NEXT: shldq %cl, %r9, %rbx +; SSE-NEXT: movq -16(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %r14 +; SSE-NEXT: shldq %cl, %r10, %r14 +; SSE-NEXT: movq -8(%rsp,%rax), %r10 +; SSE-NEXT: shldq %cl, %r9, %r10 +; SSE-NEXT: movq -64(%rsp,%rax), %rax +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movq %r10, 56(%rdi) +; SSE-NEXT: movq %r14, 48(%rdi) +; SSE-NEXT: movq %rbx, 40(%rdi) +; SSE-NEXT: movq %r11, 32(%rdi) +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rsi, 16(%rdi) +; SSE-NEXT: movq %rdx, 8(%rdi) +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: shl_1_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi 
+; AVX2-NEXT: movslq %esi, %r8 +; AVX2-NEXT: movq -56(%rsp,%r8), %rdx +; AVX2-NEXT: movq -48(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shldq %cl, %rdx, %rsi +; AVX2-NEXT: movq -40(%rsp,%r8), %r10 +; AVX2-NEXT: movq %r10, %r9 +; AVX2-NEXT: shldq %cl, %rax, %r9 +; AVX2-NEXT: movq -32(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %r11 +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: movq -24(%rsp,%r8), %r10 +; AVX2-NEXT: movq %r10, %rbx +; AVX2-NEXT: shldq %cl, %rax, %rbx +; AVX2-NEXT: movq -16(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %r14 +; AVX2-NEXT: shldq %cl, %r10, %r14 +; AVX2-NEXT: movq -8(%rsp,%r8), %r10 +; AVX2-NEXT: shldq %cl, %rax, %r10 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq -64(%rsp,%r8), %rdi +; AVX2-NEXT: shlxq %rcx, %rdi, %r8 +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %rdi, %rdx +; AVX2-NEXT: movq %r10, 56(%rax) +; AVX2-NEXT: movq %r14, 48(%rax) +; AVX2-NEXT: movq %rbx, 40(%rax) +; AVX2-NEXT: movq %r11, 32(%rax) +; AVX2-NEXT: movq %r9, 24(%rax) +; AVX2-NEXT: movq %rsi, 16(%rax) +; AVX2-NEXT: movq %rdx, 8(%rax) +; AVX2-NEXT: movq %r8, (%rax) +; AVX2-NEXT: addq $8, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; -; ZNVER4-LABEL: lshr_i512_1: -; ZNVER4: # %bb.0: -; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1 -; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] -; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4 -; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1 -; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; ZNVER4-NEXT: vpsrlq $1, %xmm2, %xmm2 -; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0 -; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; ZNVER4-NEXT: retq - %d = bitcast <8 x i64> %a to i512 - %s = lshr i512 %d, 1 - %r = bitcast i512 %s to <8 x i64> - ret <8 x i64> %r +; AVX512F-LABEL: shl_1_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: pushq %rax +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %esi, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: shrl $3, %esi +; AVX512F-NEXT: andl $56, %esi +; AVX512F-NEXT: negl %esi +; AVX512F-NEXT: movslq %esi, %r8 +; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx +; AVX512F-NEXT: movq -48(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: shldq %cl, %rdx, %rsi +; AVX512F-NEXT: movq -40(%rsp,%r8), %r10 +; AVX512F-NEXT: movq %r10, %r9 +; AVX512F-NEXT: shldq %cl, %rax, %r9 +; AVX512F-NEXT: movq -32(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: shldq %cl, %r10, %r11 +; AVX512F-NEXT: movq -24(%rsp,%r8), %r10 +; AVX512F-NEXT: movq %r10, %rbx +; AVX512F-NEXT: shldq %cl, %rax, %rbx +; AVX512F-NEXT: movq -16(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: shldq %cl, %r10, %r14 +; AVX512F-NEXT: movq -8(%rsp,%r8), %r10 +; AVX512F-NEXT: shldq %cl, %rax, %r10 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: movq 
-64(%rsp,%r8), %rdi
+; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
+; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512F-NEXT: shldq %cl, %rdi, %rdx
+; AVX512F-NEXT: movq %r10, 56(%rax)
+; AVX512F-NEXT: movq %r14, 48(%rax)
+; AVX512F-NEXT: movq %rbx, 40(%rax)
+; AVX512F-NEXT: movq %r11, 32(%rax)
+; AVX512F-NEXT: movq %r9, 24(%rax)
+; AVX512F-NEXT: movq %rsi, 16(%rax)
+; AVX512F-NEXT: movq %rdx, 8(%rax)
+; AVX512F-NEXT: movq %r8, (%rax)
+; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shl_1_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %r15
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: shrl $3, %esi
+; AVX512VL-NEXT: andl $56, %esi
+; AVX512VL-NEXT: negl %esi
+; AVX512VL-NEXT: movslq %esi, %r9
+; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
+; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT: movq %rax, %rsi
+; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
+; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT: movq %r10, %r8
+; AVX512VL-NEXT: shldq %cl, %rax, %r8
+; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq %r11, %rbx
+; AVX512VL-NEXT: shldq %cl, %r10, %rbx
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT: movq %rdi, %r10
+; AVX512VL-NEXT: shldq %cl, %r11, %r10
+; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT: movq %r14, %r15
+; AVX512VL-NEXT: shldq %cl, %rdi, %r15
+; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT: shldq %cl, %r14, %rdi
+; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
+; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512VL-NEXT: shldq %cl, %r11, %rdx
+; AVX512VL-NEXT: movq %rdi, 56(%rax)
+; AVX512VL-NEXT: movq %r15, 48(%rax)
+; AVX512VL-NEXT: movq %r10, 40(%rax)
+; AVX512VL-NEXT: movq %rbx, 32(%rax)
+; AVX512VL-NEXT: movq %r8, 24(%rax)
+; AVX512VL-NEXT: movq %rsi, 16(%rax)
+; AVX512VL-NEXT: movq %rdx, 8(%rax)
+; AVX512VL-NEXT: movq %r9, (%rax)
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512VBMI-LABEL: shl_1_i512:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %r15
+; AVX512VBMI-NEXT: pushq %r14
+; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: andl $63, %ecx
+; AVX512VBMI-NEXT: shrl $3, %esi
+; AVX512VBMI-NEXT: andl $56, %esi
+; AVX512VBMI-NEXT: negl %esi
+; AVX512VBMI-NEXT: movslq %esi, %r9
+; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT: movq %rax, %rsi
+; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
+; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT: movq %r10, %r8
+; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
+; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq %r11, %rbx
+; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: movq %rdi, %r10
+; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
+; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT: movq %r14, %r15
+; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
+; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
+; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
+; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
+; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
+; AVX512VBMI-NEXT: movq %r15, 48(%rax)
+; AVX512VBMI-NEXT: movq %r10, 40(%rax)
+; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
+; AVX512VBMI-NEXT: movq %r8, 24(%rax)
+; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
+; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
+; AVX512VBMI-NEXT: movq %r9, (%rax)
+; AVX512VBMI-NEXT: popq %rbx
+; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vzeroupper
+; AVX512VBMI-NEXT: retq
+  %r = shl i512 1, %a0
+  ret i512 %r
 }
-define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
-; AVX512VL-LABEL: ashr_i512_1:
+define i512 @lshr_signbit_i512(i512 %a0) nounwind {
+; SSE-LABEL: lshr_signbit_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: movq -112(%rsp,%rsi), %rdx
+; SSE-NEXT: movq -120(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: shrdq %cl, %rdx, %r8
+; SSE-NEXT: movq -104(%rsp,%rsi), %r9
+; SSE-NEXT: shrdq %cl, %r9, %rdx
+; SSE-NEXT: movq -96(%rsp,%rsi), %r10
+; SSE-NEXT: shrdq %cl, %r10, %r9
+; SSE-NEXT: movq -88(%rsp,%rsi), %r11
+; SSE-NEXT: shrdq %cl, %r11, %r10
+; SSE-NEXT: movq -80(%rsp,%rsi), %rbx
+; SSE-NEXT: shrdq %cl, %rbx, %r11
+; SSE-NEXT: movq -72(%rsp,%rsi), %r14
+; SSE-NEXT: shrdq %cl, %r14, %rbx
+; SSE-NEXT: movq -128(%rsp,%rsi), %rsi
+; SSE-NEXT: shrdq %cl, %rax, %rsi
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shrq %cl, %r14
+; SSE-NEXT: movq %r14, 56(%rdi)
+; SSE-NEXT: movq %rbx, 48(%rdi)
+; SSE-NEXT: movq %r11, 40(%rdi)
+; SSE-NEXT: movq %r10, 32(%rdi)
+; SSE-NEXT: movq %r9, 24(%rdi)
+; SSE-NEXT: movq %rdx, 16(%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: lshr_signbit_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
+; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: shrdq %cl, %rdx, %r8
+; AVX2-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX2-NEXT: shrdq %cl, %r9, %rdx
+; AVX2-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX2-NEXT: shrdq %cl, %r10, %r9
+; AVX2-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX2-NEXT: shrdq %cl, %r11, %r10
+; AVX2-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX2-NEXT: shrdq %cl, %rbx, %r11
+; AVX2-NEXT: movq -128(%rsp,%rsi), %r14
+; AVX2-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX2-NEXT: shrdq %cl, %rsi, %rbx
+; AVX2-NEXT: shrdq %cl, %rax, %r14
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrxq %rcx, %rsi, %rcx
+; AVX2-NEXT: movq %rcx, 56(%rdi)
+; AVX2-NEXT: movq %rbx, 48(%rdi)
+; AVX2-NEXT: movq %r11, 40(%rdi)
+; AVX2-NEXT: movq %r10, 32(%rdi)
+; AVX2-NEXT: movq %r9, 24(%rdi)
+; AVX2-NEXT: movq %rdx, 16(%rdi)
+; AVX2-NEXT: movq %r8, 8(%rdi)
+; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: lshr_signbit_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %esi, %ecx
+; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: shrl $3, %esi
+; AVX512F-NEXT: andl $56, %esi
+; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512F-NEXT: movq %rax, %r8
+; AVX512F-NEXT: shrdq %cl, %rdx, %r8
+; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512F-NEXT: shrdq %cl, %r9, %rdx
+; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512F-NEXT: shrdq %cl, %r10, %r9
+; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512F-NEXT: shrdq %cl, %r11, %r10
+; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512F-NEXT: shrdq %cl, %rbx, %r11
+; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14
+; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
+; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: shrxq %rcx, %rsi, %rcx
+; AVX512F-NEXT: movq %rcx, 56(%rdi)
+; AVX512F-NEXT: movq %rbx, 48(%rdi)
+; AVX512F-NEXT: movq %r11, 40(%rdi)
+; AVX512F-NEXT: movq %r10, 32(%rdi)
+; AVX512F-NEXT: movq %r9, 24(%rdi)
+; AVX512F-NEXT: movq %rdx, 16(%rdi)
+; AVX512F-NEXT: movq %r8, 8(%rdi)
+; AVX512F-NEXT: movq %r14, (%rdi)
+; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: lshr_signbit_i512:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; AVX512VL-NEXT: vpsllq $63, %xmm3, %xmm4
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX512VL-NEXT: vpsrlq $1, %xmm5, %xmm5
-; AVX512VL-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; AVX512VL-NEXT: vpsraq $1, %xmm3, %xmm3
-; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsllq $63, %ymm1, %ymm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT: vpsrlq $1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: shrl $3, %esi
+; AVX512VL-NEXT: andl $56, %esi
+; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VL-NEXT: movq %rax, %r8
+; AVX512VL-NEXT: shrdq %cl, %rdx, %r8
+; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
+; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VL-NEXT: shrdq %cl, %r10, %r9
+; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VL-NEXT: shrdq %cl, %r11, %r10
+; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
+; AVX512VL-NEXT: movq %rcx, 56(%rdi)
+; AVX512VL-NEXT: movq %rbx, 48(%rdi)
+; AVX512VL-NEXT: movq %r11, 40(%rdi)
+; AVX512VL-NEXT: movq %r10, 32(%rdi)
+; AVX512VL-NEXT: movq %r9, 24(%rdi)
+; AVX512VL-NEXT: movq %rdx, 16(%rdi)
+; AVX512VL-NEXT: movq %r8, 8(%rdi)
+; AVX512VL-NEXT: movq %rsi, (%rdi)
+; AVX512VL-NEXT: addq $8, %rsp
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
-; AVX512VBMI-LABEL: ashr_i512_1:
+; AVX512VBMI-LABEL: lshr_signbit_i512:
 ; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512VBMI-NEXT: vpsraq $1, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
-; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512VBMI-NEXT: pushq %r14
+; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: andl $63, %ecx
+; AVX512VBMI-NEXT: shrl $3, %esi
+; AVX512VBMI-NEXT: andl $56, %esi
+; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VBMI-NEXT: movq %rax, %r8
+; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8
+; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
+; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
+; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
+; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
+; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
+; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
+; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
+; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
+; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
+; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
+; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
+; AVX512VBMI-NEXT: movq %rsi, (%rdi)
+; AVX512VBMI-NEXT: addq $8, %rsp
+; AVX512VBMI-NEXT: popq %rbx
+; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: vzeroupper
 ; AVX512VBMI-NEXT: retq
+  %s = shl i512 1, 511
+  %r = lshr i512 %s, %a0
+  ret i512 %r
+}
+
+define i512 @ashr_signbit_i512(i512 %a0) nounwind {
+; SSE-LABEL: ashr_signbit_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: movq -112(%rsp,%rsi), %rdx
+; SSE-NEXT: movq -120(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: shrdq %cl, %rdx, %r8
+; SSE-NEXT: movq -104(%rsp,%rsi), %r9
+; SSE-NEXT: shrdq %cl, %r9, %rdx
+; SSE-NEXT: movq -96(%rsp,%rsi), %r10
+; SSE-NEXT: shrdq %cl, %r10, %r9
+; SSE-NEXT: movq -88(%rsp,%rsi), %r11
+; SSE-NEXT: shrdq %cl, %r11, %r10
+; SSE-NEXT: movq -80(%rsp,%rsi), %rbx
+; SSE-NEXT: shrdq %cl, %rbx, %r11
+; SSE-NEXT: movq -72(%rsp,%rsi), %r14
+; SSE-NEXT: shrdq %cl, %r14, %rbx
+; SSE-NEXT: movq -128(%rsp,%rsi), %rsi
+; SSE-NEXT: shrdq %cl, %rax, %rsi
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: sarq %cl, %r14
+; SSE-NEXT: movq %r14, 56(%rdi)
+; SSE-NEXT: movq %rbx, 48(%rdi)
+; SSE-NEXT: movq %r11, 40(%rdi)
+; SSE-NEXT: movq %r10, 32(%rdi)
+; SSE-NEXT: movq %r9, 24(%rdi)
+; SSE-NEXT: movq %rdx, 16(%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: ashr_signbit_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: shrdq %cl, %rdx, %r8
+; AVX2-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX2-NEXT: shrdq %cl, %r9, %rdx
+; AVX2-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX2-NEXT: shrdq %cl, %r10, %r9
+; AVX2-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX2-NEXT: shrdq %cl, %r11, %r10
+; AVX2-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX2-NEXT: shrdq %cl, %rbx, %r11
+; AVX2-NEXT: movq -128(%rsp,%rsi), %r14
+; AVX2-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX2-NEXT: shrdq %cl, %rsi, %rbx
+; AVX2-NEXT: shrdq %cl, %rax, %r14
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: sarxq %rcx, %rsi, %rcx
+; AVX2-NEXT: movq %rcx, 56(%rdi)
+; AVX2-NEXT: movq %rbx, 48(%rdi)
+; AVX2-NEXT: movq %r11, 40(%rdi)
+; AVX2-NEXT: movq %r10, 32(%rdi)
+; AVX2-NEXT: movq %r9, 24(%rdi)
+; AVX2-NEXT: movq %rdx, 16(%rdi)
+; AVX2-NEXT: movq %r8, 8(%rdi)
+; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 ;
-; ZNVER4-LABEL: ashr_i512_1:
-; ZNVER4: # %bb.0:
-; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
-; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
-; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
-; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; ZNVER4-NEXT: vpsraq $1, %xmm2, %xmm2
-; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0
-; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; ZNVER4-NEXT: retq
- %d = bitcast <8 x i64> %a to i512
- %s = ashr i512 %d, 1
- %r = bitcast i512 %s to <8 x i64>
- ret <8 x i64> %r
+; AVX512F-LABEL: ashr_signbit_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %esi, %ecx
+; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: shrl $3, %esi
+; AVX512F-NEXT: andl $56, %esi
+; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512F-NEXT: movq %rax, %r8
+; AVX512F-NEXT: shrdq %cl, %rdx, %r8
+; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512F-NEXT: shrdq %cl, %r9, %rdx
+; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512F-NEXT: shrdq %cl, %r10, %r9
+; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512F-NEXT: shrdq %cl, %r11, %r10
+; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512F-NEXT: shrdq %cl, %rbx, %r11
+; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14
+; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
+; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: sarxq %rcx, %rsi, %rcx
+; AVX512F-NEXT: movq %rcx, 56(%rdi)
+; AVX512F-NEXT: movq %rbx, 48(%rdi)
+; AVX512F-NEXT: movq %r11, 40(%rdi)
+; AVX512F-NEXT: movq %r10, 32(%rdi)
+; AVX512F-NEXT: movq %r9, 24(%rdi)
+; AVX512F-NEXT: movq %rdx, 16(%rdi)
+; AVX512F-NEXT: movq %r8, 8(%rdi)
+; AVX512F-NEXT: movq %r14, (%rdi)
+; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: ashr_signbit_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: shrl $3, %esi
+; AVX512VL-NEXT: andl $56, %esi
+; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VL-NEXT: movq %rax, %r8
+; AVX512VL-NEXT: shrdq %cl, %rdx, %r8
+; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
+; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VL-NEXT: shrdq %cl, %r10, %r9
+; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VL-NEXT: shrdq %cl, %r11, %r10
+; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
+; AVX512VL-NEXT: movq %rcx, 56(%rdi)
+; AVX512VL-NEXT: movq %rbx, 48(%rdi)
+; AVX512VL-NEXT: movq %r11, 40(%rdi)
+; AVX512VL-NEXT: movq %r10, 32(%rdi)
+; AVX512VL-NEXT: movq %r9, 24(%rdi)
+; AVX512VL-NEXT: movq %rdx, 16(%rdi)
+; AVX512VL-NEXT: movq %r8, 8(%rdi)
+; AVX512VL-NEXT: movq %rsi, (%rdi)
+; AVX512VL-NEXT: addq $8, %rsp
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512VBMI-LABEL: ashr_signbit_i512:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %r14
+; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: andl $63, %ecx
+; AVX512VBMI-NEXT: shrl $3, %esi
+; AVX512VBMI-NEXT: andl $56, %esi
+; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VBMI-NEXT: movq %rax, %r8
+; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8
+; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
+; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
+; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
+; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
+; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
+; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
+; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
+; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
+; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
+; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
+; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
+; AVX512VBMI-NEXT: movq %rsi, (%rdi)
+; AVX512VBMI-NEXT: addq $8, %rsp
+; AVX512VBMI-NEXT: popq %rbx
+; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: vzeroupper
+; AVX512VBMI-NEXT: retq
+  %s = shl i512 1, 511
+  %r = ashr i512 %s, %a0
+  ret i512 %r
 }