diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index c0962236f93dd..65b602801b365 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1,36 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK0 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK1 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK2 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK3 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK4 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK5 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK6 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK7 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK8 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK9 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK10 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK11 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK12 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK13 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK14 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK15 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK16 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK17 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK18 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK19 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK20 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK21 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK22 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK23 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK24 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK25 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK26 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK27 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK28 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK29 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK30 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK31 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-SHLD-NO-BMI2-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-SHLD-NO-BMI2-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX512 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-SSE4 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-SSE4 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-SSE4 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-SSE4 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-SHLD-NO-BMI2-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-SHLD-NO-BMI2-AVX512 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX512 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX512 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX512 define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: lshr_4bytes: @@ -646,787 +646,596 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; -; FALLBACK16-LABEL: lshr_16bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $60, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl (%ecx), %edx -; FALLBACK16-NEXT: movl 4(%ecx), %esi -; FALLBACK16-NEXT: movl 8(%ecx), %edi -; FALLBACK16-NEXT: movl 12(%ecx), %ecx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movb %ah, %al -; FALLBACK16-NEXT: shlb $3, %al -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $12, %ah -; FALLBACK16-NEXT: movzbl %ah, %ebp -; FALLBACK16-NEXT: movl 20(%esp,%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %ebx -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl %eax, %edx -; FALLBACK16-NEXT: notb %dl -; FALLBACK16-NEXT: movl 24(%esp,%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi -; FALLBACK16-NEXT: movl 16(%esp,%ebp), %ebx -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %esi, %esi -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %ebx, %esi -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; FALLBACK16-NEXT: movl 28(%esp,%ebp), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl %ebx, 12(%edx) -; FALLBACK16-NEXT: movl %ebp, 8(%edx) -; FALLBACK16-NEXT: movl %esi, (%edx) -; FALLBACK16-NEXT: movl %edi, 4(%edx) -; FALLBACK16-NEXT: addl $60, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: lshr_16bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $44, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK17-NEXT: movl (%edx), %esi -; FALLBACK17-NEXT: movl 4(%edx), %edi -; FALLBACK17-NEXT: movl 8(%edx), %ebx -; FALLBACK17-NEXT: movl 12(%edx), %edx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movb %ch, %cl -; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, (%esp) -; FALLBACK17-NEXT: andb $12, %ch -; FALLBACK17-NEXT: movzbl %ch, %ebx -; FALLBACK17-NEXT: movl 8(%esp,%ebx), %esi -; FALLBACK17-NEXT: movl (%esp,%ebx), %edx -; FALLBACK17-NEXT: movl 4(%esp,%ebx), %ebp -; FALLBACK17-NEXT: movl %ebp, %edi -; FALLBACK17-NEXT: shrdl %cl, %esi, %edi -; FALLBACK17-NEXT: movl 12(%esp,%ebx), %ebx -; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi -; FALLBACK17-NEXT: shrdl %cl, %ebp, %edx -; FALLBACK17-NEXT: shrl %cl, %ebx -; FALLBACK17-NEXT: movl %esi, 8(%eax) -; FALLBACK17-NEXT: movl %ebx, 12(%eax) -; FALLBACK17-NEXT: movl %edx, (%eax) -; FALLBACK17-NEXT: movl %edi, 4(%eax) -; FALLBACK17-NEXT: addl $44, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: popl %ebp -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: lshr_16bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $44, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK18-NEXT: movl (%ecx), %edx -; FALLBACK18-NEXT: movl 4(%ecx), %esi -; FALLBACK18-NEXT: movl 8(%ecx), %edi -; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %ebx -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: shlb $3, %al -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, (%esp) -; FALLBACK18-NEXT: movl %eax, %ecx -; FALLBACK18-NEXT: andb $12, %bl -; FALLBACK18-NEXT: movzbl %bl, %edi -; FALLBACK18-NEXT: movl 4(%esp,%edi), %ebx -; FALLBACK18-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK18-NEXT: notb %al -; FALLBACK18-NEXT: leal (%esi,%esi), %edx -; FALLBACK18-NEXT: shlxl %eax, %edx, %edx -; FALLBACK18-NEXT: orl %ebp, %edx -; FALLBACK18-NEXT: shrxl %ecx, (%esp,%edi), %ebp -; FALLBACK18-NEXT: addl %ebx, %ebx -; FALLBACK18-NEXT: shlxl %eax, %ebx, %ebx -; FALLBACK18-NEXT: orl %ebp, %ebx -; FALLBACK18-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK18-NEXT: leal (%edi,%edi), %ebp -; FALLBACK18-NEXT: shlxl %eax, %ebp, %eax -; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi -; FALLBACK18-NEXT: orl %esi, %eax -; FALLBACK18-NEXT: shrxl %ecx, %edi, %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK18-NEXT: movl %ecx, 12(%esi) -; FALLBACK18-NEXT: movl %eax, 8(%esi) -; FALLBACK18-NEXT: movl %ebx, (%esi) -; FALLBACK18-NEXT: movl %edx, 4(%esi) -; FALLBACK18-NEXT: addl $44, %esp -; FALLBACK18-NEXT: popl %esi -; FALLBACK18-NEXT: popl %edi -; FALLBACK18-NEXT: popl %ebx -; FALLBACK18-NEXT: popl %ebp -; FALLBACK18-NEXT: retl -; -; FALLBACK19-LABEL: lshr_16bytes: -; FALLBACK19: # %bb.0: -; FALLBACK19-NEXT: pushl %ebp -; FALLBACK19-NEXT: pushl %ebx -; FALLBACK19-NEXT: pushl %edi -; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $44, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK19-NEXT: movl (%edx), %esi -; FALLBACK19-NEXT: movl 4(%edx), %edi -; FALLBACK19-NEXT: movl 8(%edx), %ebx -; FALLBACK19-NEXT: movl 12(%edx), %edx -; FALLBACK19-NEXT: movzbl (%ecx), %eax -; FALLBACK19-NEXT: movl %eax, %ecx -; FALLBACK19-NEXT: shlb $3, %cl -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, (%esp) -; FALLBACK19-NEXT: andb $12, %al -; FALLBACK19-NEXT: movzbl %al, %eax -; FALLBACK19-NEXT: movl 8(%esp,%eax), %ebx -; FALLBACK19-NEXT: movl (%esp,%eax), %edx -; FALLBACK19-NEXT: movl 4(%esp,%eax), %esi -; FALLBACK19-NEXT: movl %esi, %edi -; FALLBACK19-NEXT: shrdl %cl, %ebx, %edi -; FALLBACK19-NEXT: movl 12(%esp,%eax), %eax -; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK19-NEXT: movl %ebx, 8(%ebp) -; FALLBACK19-NEXT: shrxl %ecx, %eax, %eax -; FALLBACK19-NEXT: movl %eax, 12(%ebp) -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: shrdl %cl, %esi, %edx -; FALLBACK19-NEXT: movl %edx, (%ebp) -; FALLBACK19-NEXT: movl %edi, 4(%ebp) -; FALLBACK19-NEXT: addl $44, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: lshr_16bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $60, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movzbl (%eax), %ecx -; FALLBACK20-NEXT: movl %ecx, %eax -; FALLBACK20-NEXT: shlb $3, %al -; FALLBACK20-NEXT: xorps %xmm1, %xmm1 -; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: andb $12, %cl -; FALLBACK20-NEXT: movzbl %cl, %edi -; FALLBACK20-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl 20(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, %esi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: movl 28(%esp,%edi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %esi, %ebp -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %esi, %ebx -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: movl %edi, 12(%edx) -; FALLBACK20-NEXT: movl %ebx, 4(%edx) -; FALLBACK20-NEXT: movl %ebp, 8(%edx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, (%edx) -; FALLBACK20-NEXT: addl $60, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: lshr_16bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $44, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK21-NEXT: movups (%edx), %xmm0 -; FALLBACK21-NEXT: movzbl (%ecx), %edx -; FALLBACK21-NEXT: movl %edx, %ecx -; FALLBACK21-NEXT: shlb $3, %cl -; FALLBACK21-NEXT: xorps %xmm1, %xmm1 -; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, (%esp) -; FALLBACK21-NEXT: andb $12, %dl -; FALLBACK21-NEXT: movzbl %dl, %ebx -; FALLBACK21-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK21-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK21-NEXT: movl %ebp, %edi -; FALLBACK21-NEXT: shrdl %cl, %edx, %edi -; FALLBACK21-NEXT: movl (%esp,%ebx), %esi -; FALLBACK21-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK21-NEXT: movl %eax, %ebx -; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %ebx, 4(%ebp) -; FALLBACK21-NEXT: movl %edi, 8(%ebp) -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: shrl %cl, %edx -; FALLBACK21-NEXT: movl %edx, 12(%ebp) -; FALLBACK21-NEXT: movl %esi, (%ebp) -; FALLBACK21-NEXT: addl $44, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: lshr_16bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $60, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %edx -; FALLBACK22-NEXT: movl %edx, %eax -; FALLBACK22-NEXT: shlb $3, %al -; FALLBACK22-NEXT: xorps %xmm1, %xmm1 -; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: andb $12, %dl -; FALLBACK22-NEXT: movzbl %dl, %edi -; FALLBACK22-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp -; FALLBACK22-NEXT: notb %al -; FALLBACK22-NEXT: movl 20(%esp,%edi), %edx -; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK22-NEXT: addl %edx, %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %edx -; FALLBACK22-NEXT: orl %ebp, %edx -; FALLBACK22-NEXT: movl 28(%esp,%edi), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %edi -; FALLBACK22-NEXT: shlxl %eax, %edi, %edi -; FALLBACK22-NEXT: shrxl %ecx, %ebx, %esi -; FALLBACK22-NEXT: orl %esi, %edi -; FALLBACK22-NEXT: addl %ebx, %ebx -; FALLBACK22-NEXT: shlxl %eax, %ebx, %eax -; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %ecx, 12(%esi) -; FALLBACK22-NEXT: movl %eax, 4(%esi) -; FALLBACK22-NEXT: movl %edi, 8(%esi) -; FALLBACK22-NEXT: movl %edx, (%esi) -; FALLBACK22-NEXT: addl $60, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: lshr_16bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $44, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK23-NEXT: movups (%edx), %xmm0 -; FALLBACK23-NEXT: movzbl (%ecx), %edx -; FALLBACK23-NEXT: movl %edx, %ecx -; FALLBACK23-NEXT: shlb $3, %cl -; FALLBACK23-NEXT: xorps %xmm1, %xmm1 -; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, (%esp) -; FALLBACK23-NEXT: andb $12, %dl -; FALLBACK23-NEXT: movzbl %dl, %ebx -; FALLBACK23-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK23-NEXT: movl %ebp, %edi -; FALLBACK23-NEXT: shrdl %cl, %edx, %edi -; FALLBACK23-NEXT: movl (%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %ebx -; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK23-NEXT: movl %ebx, 4(%ebp) -; FALLBACK23-NEXT: movl %edi, 8(%ebp) -; FALLBACK23-NEXT: shrxl %ecx, %edx, %edx -; FALLBACK23-NEXT: movl %edx, 12(%ebp) -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl %esi, (%ebp) -; FALLBACK23-NEXT: addl $44, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: lshr_16bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $60, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK24-NEXT: movzbl (%eax), %ecx -; FALLBACK24-NEXT: movl %ecx, %eax -; FALLBACK24-NEXT: shlb $3, %al -; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: andb $12, %cl -; FALLBACK24-NEXT: movzbl %cl, %edi -; FALLBACK24-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl 20(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, %esi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: movl 28(%esp,%edi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %esi, %ebp -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %esi, %ebx -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: movl %edi, 12(%edx) -; FALLBACK24-NEXT: movl %ebx, 4(%edx) -; FALLBACK24-NEXT: movl %ebp, 8(%edx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, (%edx) -; FALLBACK24-NEXT: addl $60, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: lshr_16bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $44, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK25-NEXT: vmovups (%edx), %xmm0 -; FALLBACK25-NEXT: movzbl (%ecx), %edx -; FALLBACK25-NEXT: movl %edx, %ecx -; FALLBACK25-NEXT: shlb $3, %cl -; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK25-NEXT: andb $12, %dl -; FALLBACK25-NEXT: movzbl %dl, %ebx -; FALLBACK25-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK25-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK25-NEXT: movl %ebp, %edi -; FALLBACK25-NEXT: shrdl %cl, %edx, %edi -; FALLBACK25-NEXT: movl (%esp,%ebx), %esi -; FALLBACK25-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK25-NEXT: movl %eax, %ebx -; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %ebx, 4(%ebp) -; FALLBACK25-NEXT: movl %edi, 8(%ebp) -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: shrl %cl, %edx -; FALLBACK25-NEXT: movl %edx, 12(%ebp) -; FALLBACK25-NEXT: movl %esi, (%ebp) -; FALLBACK25-NEXT: addl $44, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: lshr_16bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $60, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %edx -; FALLBACK26-NEXT: movl %edx, %eax -; FALLBACK26-NEXT: shlb $3, %al -; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: andb $12, %dl -; FALLBACK26-NEXT: movzbl %dl, %edi -; FALLBACK26-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp -; FALLBACK26-NEXT: notb %al -; FALLBACK26-NEXT: movl 20(%esp,%edi), %edx -; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK26-NEXT: addl %edx, %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %edx -; FALLBACK26-NEXT: orl %ebp, %edx -; FALLBACK26-NEXT: movl 28(%esp,%edi), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %edi -; FALLBACK26-NEXT: shlxl %eax, %edi, %edi -; FALLBACK26-NEXT: shrxl %ecx, %ebx, %esi -; FALLBACK26-NEXT: orl %esi, %edi -; FALLBACK26-NEXT: addl %ebx, %ebx -; FALLBACK26-NEXT: shlxl %eax, %ebx, %eax -; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: shrxl %ecx, %ebp, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %ecx, 12(%esi) -; FALLBACK26-NEXT: movl %eax, 4(%esi) -; FALLBACK26-NEXT: movl %edi, 8(%esi) -; FALLBACK26-NEXT: movl %edx, (%esi) -; FALLBACK26-NEXT: addl $60, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: lshr_16bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $44, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK27-NEXT: vmovups (%edx), %xmm0 -; FALLBACK27-NEXT: movzbl (%ecx), %edx -; FALLBACK27-NEXT: movl %edx, %ecx -; FALLBACK27-NEXT: shlb $3, %cl -; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK27-NEXT: andb $12, %dl -; FALLBACK27-NEXT: movzbl %dl, %ebx -; FALLBACK27-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK27-NEXT: movl %ebp, %edi -; FALLBACK27-NEXT: shrdl %cl, %edx, %edi -; FALLBACK27-NEXT: movl (%esp,%ebx), %esi -; FALLBACK27-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %ebx -; FALLBACK27-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK27-NEXT: movl %ebx, 4(%ebp) -; FALLBACK27-NEXT: movl %edi, 8(%ebp) -; FALLBACK27-NEXT: shrxl %ecx, %edx, %edx -; FALLBACK27-NEXT: movl %edx, 12(%ebp) -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: shrdl %cl, %eax, %esi -; FALLBACK27-NEXT: movl %esi, (%ebp) -; FALLBACK27-NEXT: addl $44, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: lshr_16bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $60, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK28-NEXT: movzbl (%eax), %ecx -; FALLBACK28-NEXT: movl %ecx, %eax -; FALLBACK28-NEXT: shlb $3, %al -; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: andb $12, %cl -; FALLBACK28-NEXT: movzbl %cl, %edi -; FALLBACK28-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl 20(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, %esi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: movl 28(%esp,%edi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %esi, %ebp -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %esi, %ebx -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: movl %edi, 12(%edx) -; FALLBACK28-NEXT: movl %ebx, 4(%edx) -; FALLBACK28-NEXT: movl %ebp, 8(%edx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, (%edx) -; FALLBACK28-NEXT: addl $60, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: lshr_16bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $44, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK29-NEXT: vmovups (%edx), %xmm0 -; FALLBACK29-NEXT: movzbl (%ecx), %edx -; FALLBACK29-NEXT: movl %edx, %ecx -; FALLBACK29-NEXT: shlb $3, %cl -; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK29-NEXT: andb $12, %dl -; FALLBACK29-NEXT: movzbl %dl, %ebx -; FALLBACK29-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK29-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK29-NEXT: movl %ebp, %edi -; FALLBACK29-NEXT: shrdl %cl, %edx, %edi -; FALLBACK29-NEXT: movl (%esp,%ebx), %esi -; FALLBACK29-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK29-NEXT: movl %eax, %ebx -; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %ebx, 4(%ebp) -; FALLBACK29-NEXT: movl %edi, 8(%ebp) -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: shrl %cl, %edx -; FALLBACK29-NEXT: movl %edx, 12(%ebp) -; FALLBACK29-NEXT: movl %esi, (%ebp) -; FALLBACK29-NEXT: addl $44, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: lshr_16bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $60, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %edx -; FALLBACK30-NEXT: movl %edx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: andb $12, %dl -; FALLBACK30-NEXT: movzbl %dl, %edi -; FALLBACK30-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp -; FALLBACK30-NEXT: notb %al -; FALLBACK30-NEXT: movl 20(%esp,%edi), %edx -; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK30-NEXT: addl %edx, %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %edx -; FALLBACK30-NEXT: orl %ebp, %edx -; FALLBACK30-NEXT: movl 28(%esp,%edi), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %edi -; FALLBACK30-NEXT: shlxl %eax, %edi, %edi -; FALLBACK30-NEXT: shrxl %ecx, %ebx, %esi -; FALLBACK30-NEXT: orl %esi, %edi -; FALLBACK30-NEXT: addl %ebx, %ebx -; FALLBACK30-NEXT: shlxl %eax, %ebx, %eax -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: shrxl %ecx, %ebp, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %ecx, 12(%esi) -; FALLBACK30-NEXT: movl %eax, 4(%esi) -; FALLBACK30-NEXT: movl %edi, 8(%esi) -; FALLBACK30-NEXT: movl %edx, (%esi) -; FALLBACK30-NEXT: addl $60, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: lshr_16bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $44, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK31-NEXT: vmovups (%edx), %xmm0 -; FALLBACK31-NEXT: movzbl (%ecx), %edx -; FALLBACK31-NEXT: movl %edx, %ecx -; FALLBACK31-NEXT: shlb $3, %cl -; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK31-NEXT: andb $12, %dl -; FALLBACK31-NEXT: movzbl %dl, %ebx -; FALLBACK31-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK31-NEXT: movl %ebp, %edi -; FALLBACK31-NEXT: shrdl %cl, %edx, %edi -; FALLBACK31-NEXT: movl (%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %ebx -; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK31-NEXT: movl %ebx, 4(%ebp) -; FALLBACK31-NEXT: movl %edi, 8(%ebp) -; FALLBACK31-NEXT: shrxl %ecx, %edx, %edx -; FALLBACK31-NEXT: movl %edx, 12(%ebp) -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl %esi, (%ebp) -; FALLBACK31-NEXT: addl $44, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: retl +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_16bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %al +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ah, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 12(%edx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 8(%edx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%edx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 4(%edx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%edx), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%edx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%edx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ch, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%esp,%ebx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp,%ebx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%esp,%ebx), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%esp,%ebx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebp, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 12(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_16bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %bl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %bl, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%esp,%edi), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, (%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%esp,%edi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%edx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%edx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%edx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %al, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%esp,%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp,%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%esp,%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%esp,%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_16bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 12(%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 8(%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%edx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %dl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %dl, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 12(%esp,%ebx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 8(%esp,%ebx), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp,%ebx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_16bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $60, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%ebp,%ebp), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %ebx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $60, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%edx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 12(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 8(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_16bytes: +; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $60, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $12, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 16(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 20(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 24(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 28(%esp,%edi), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 12(%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 8(%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $60, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%edx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, (%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $12, %dl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %dl, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 12(%esp,%ebx), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 8(%esp,%ebx), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp,%ebx), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_16bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $60, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%esp,%edi), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%ebp,%ebp), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, %ebx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %eax, %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $60, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%edx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 12(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 8(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %byteOff = load i128, ptr %byteOff.ptr, align 1 %bitOff = shl i128 %byteOff, 3 @@ -1664,791 +1473,599 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; -; FALLBACK16-LABEL: shl_16bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $60, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl (%ecx), %ebx -; FALLBACK16-NEXT: movl 4(%ecx), %esi -; FALLBACK16-NEXT: movl 8(%ecx), %edi -; FALLBACK16-NEXT: movl 12(%ecx), %ecx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movb %ah, %dh -; FALLBACK16-NEXT: shlb $3, %dh -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $12, %ah -; FALLBACK16-NEXT: negb %ah -; FALLBACK16-NEXT: movsbl %ah, %ebp -; FALLBACK16-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%esp,%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %edi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: movb %dh, %dl -; FALLBACK16-NEXT: notb %dl -; FALLBACK16-NEXT: shrl %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: movl 40(%esp,%ebp), %edi -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: shrl %esi -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: orl %edi, %esi -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: movl %edx, (%eax) -; FALLBACK16-NEXT: movl %esi, 8(%eax) -; FALLBACK16-NEXT: movl %ebp, 12(%eax) -; FALLBACK16-NEXT: movl %ebx, 4(%eax) -; FALLBACK16-NEXT: addl $60, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: shl_16bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $32, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK17-NEXT: movl (%edx), %esi -; FALLBACK17-NEXT: movl 4(%edx), %edi -; FALLBACK17-NEXT: movl 8(%edx), %ebx -; FALLBACK17-NEXT: movl 12(%edx), %edx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movb %ch, %cl -; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, (%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: andb $12, %ch -; FALLBACK17-NEXT: negb %ch -; FALLBACK17-NEXT: movsbl %ch, %edi -; FALLBACK17-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK17-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK17-NEXT: shldl %cl, %esi, %edx -; FALLBACK17-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK17-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK17-NEXT: shldl %cl, %edi, %esi -; FALLBACK17-NEXT: shldl %cl, %ebx, %edi -; FALLBACK17-NEXT: shll %cl, %ebx -; FALLBACK17-NEXT: movl %esi, 8(%eax) -; FALLBACK17-NEXT: movl %edx, 12(%eax) -; FALLBACK17-NEXT: movl %ebx, (%eax) -; FALLBACK17-NEXT: movl %edi, 4(%eax) -; FALLBACK17-NEXT: addl $32, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: shl_16bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $44, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK18-NEXT: movl (%ecx), %edx -; FALLBACK18-NEXT: movl 4(%ecx), %esi -; FALLBACK18-NEXT: movl 8(%ecx), %edi -; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %ebx -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: shlb $3, %al -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, (%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, %ecx -; FALLBACK18-NEXT: andb $12, %bl -; FALLBACK18-NEXT: negb %bl -; FALLBACK18-NEXT: movsbl %bl, %esi -; FALLBACK18-NEXT: movl 16(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl 20(%esp,%esi), %edx -; FALLBACK18-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK18-NEXT: notb %al -; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebp -; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx -; FALLBACK18-NEXT: orl %edi, %ebx -; FALLBACK18-NEXT: shlxl %ecx, 28(%esp,%esi), %edi -; FALLBACK18-NEXT: movl 24(%esp,%esi), %esi -; FALLBACK18-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %eax, %esi, %esi -; FALLBACK18-NEXT: orl %edi, %esi -; FALLBACK18-NEXT: shrl %edx -; FALLBACK18-NEXT: shrxl %eax, %edx, %eax -; FALLBACK18-NEXT: orl %ecx, %eax -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK18-NEXT: movl %ebp, (%ecx) -; FALLBACK18-NEXT: movl %eax, 8(%ecx) -; FALLBACK18-NEXT: movl %esi, 12(%ecx) -; FALLBACK18-NEXT: movl %ebx, 4(%ecx) -; FALLBACK18-NEXT: addl $44, %esp -; FALLBACK18-NEXT: popl %esi -; FALLBACK18-NEXT: popl %edi -; FALLBACK18-NEXT: popl %ebx -; FALLBACK18-NEXT: popl %ebp -; FALLBACK18-NEXT: retl -; -; FALLBACK19-LABEL: shl_16bytes: -; FALLBACK19: # %bb.0: -; FALLBACK19-NEXT: pushl %ebp -; FALLBACK19-NEXT: pushl %ebx -; FALLBACK19-NEXT: pushl %edi -; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $44, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK19-NEXT: movl (%edx), %esi -; FALLBACK19-NEXT: movl 4(%edx), %edi -; FALLBACK19-NEXT: movl 8(%edx), %ebx -; FALLBACK19-NEXT: movl 12(%edx), %edx -; FALLBACK19-NEXT: movzbl (%ecx), %eax -; FALLBACK19-NEXT: movl %eax, %ecx -; FALLBACK19-NEXT: shlb $3, %cl -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, (%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: andb $12, %al -; FALLBACK19-NEXT: negb %al -; FALLBACK19-NEXT: movsbl %al, %eax -; FALLBACK19-NEXT: movl 24(%esp,%eax), %esi -; FALLBACK19-NEXT: movl 28(%esp,%eax), %edx -; FALLBACK19-NEXT: shldl %cl, %esi, %edx -; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi -; FALLBACK19-NEXT: movl 20(%esp,%eax), %eax -; FALLBACK19-NEXT: shldl %cl, %eax, %esi -; FALLBACK19-NEXT: shldl %cl, %edi, %eax -; FALLBACK19-NEXT: shlxl %ecx, %edi, %ecx -; FALLBACK19-NEXT: movl %esi, 8(%ebp) -; FALLBACK19-NEXT: movl %edx, 12(%ebp) -; FALLBACK19-NEXT: movl %ecx, (%ebp) -; FALLBACK19-NEXT: movl %eax, 4(%ebp) -; FALLBACK19-NEXT: addl $44, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: shl_16bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $60, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movzbl (%eax), %ecx -; FALLBACK20-NEXT: movl %ecx, %eax -; FALLBACK20-NEXT: shlb $3, %al -; FALLBACK20-NEXT: xorps %xmm1, %xmm1 -; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: andb $12, %cl -; FALLBACK20-NEXT: negb %cl -; FALLBACK20-NEXT: movsbl %cl, %edi -; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: movl 40(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %esi -; FALLBACK20-NEXT: shrl %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: movl 32(%esp,%edi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, %edi -; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %ebp, %edi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: movl %eax, (%edx) -; FALLBACK20-NEXT: movl %ebp, 4(%edx) -; FALLBACK20-NEXT: movl %edi, 8(%edx) -; FALLBACK20-NEXT: movl %esi, 12(%edx) -; FALLBACK20-NEXT: addl $60, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: shl_16bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $44, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK21-NEXT: movups (%edx), %xmm0 -; FALLBACK21-NEXT: movzbl (%ecx), %edx -; FALLBACK21-NEXT: movl %edx, %ecx -; FALLBACK21-NEXT: shlb $3, %cl -; FALLBACK21-NEXT: xorps %xmm1, %xmm1 -; FALLBACK21-NEXT: movaps %xmm1, (%esp) -; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: andb $12, %dl -; FALLBACK21-NEXT: negb %dl -; FALLBACK21-NEXT: movsbl %dl, %edi -; FALLBACK21-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK21-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK21-NEXT: shldl %cl, %esi, %edx -; FALLBACK21-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK21-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK21-NEXT: shldl %cl, %edi, %esi -; FALLBACK21-NEXT: movl %ebx, %ebp -; FALLBACK21-NEXT: shll %cl, %ebp -; FALLBACK21-NEXT: shldl %cl, %ebx, %edi -; FALLBACK21-NEXT: movl %edi, 4(%eax) -; FALLBACK21-NEXT: movl %esi, 8(%eax) -; FALLBACK21-NEXT: movl %edx, 12(%eax) -; FALLBACK21-NEXT: movl %ebp, (%eax) -; FALLBACK21-NEXT: addl $44, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: shl_16bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $44, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %edx -; FALLBACK22-NEXT: movl %edx, %eax -; FALLBACK22-NEXT: shlb $3, %al -; FALLBACK22-NEXT: xorps %xmm1, %xmm1 -; FALLBACK22-NEXT: movaps %xmm1, (%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: andb $12, %dl -; FALLBACK22-NEXT: negb %dl -; FALLBACK22-NEXT: movsbl %dl, %edx -; FALLBACK22-NEXT: shlxl %ecx, 28(%esp,%edx), %edi -; FALLBACK22-NEXT: notb %al -; FALLBACK22-NEXT: movl 24(%esp,%edx), %esi -; FALLBACK22-NEXT: shlxl %ecx, %esi, %ebx -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %eax, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl 20(%esp,%edx), %edi -; FALLBACK22-NEXT: movl %edi, %ebp -; FALLBACK22-NEXT: shrl %ebp -; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK22-NEXT: orl %ebx, %ebp -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: movl 16(%esp,%edx), %edx -; FALLBACK22-NEXT: shlxl %ecx, %edx, %ecx -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %eax, %edx, %eax -; FALLBACK22-NEXT: orl %edi, %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %ecx, (%edx) -; FALLBACK22-NEXT: movl %eax, 4(%edx) -; FALLBACK22-NEXT: movl %ebp, 8(%edx) -; FALLBACK22-NEXT: movl %esi, 12(%edx) -; FALLBACK22-NEXT: addl $44, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: shl_16bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $44, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK23-NEXT: movups (%edx), %xmm0 -; FALLBACK23-NEXT: movzbl (%ecx), %edx -; FALLBACK23-NEXT: movl %edx, %ecx -; FALLBACK23-NEXT: shlb $3, %cl -; FALLBACK23-NEXT: xorps %xmm1, %xmm1 -; FALLBACK23-NEXT: movaps %xmm1, (%esp) -; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: andb $12, %dl -; FALLBACK23-NEXT: negb %dl -; FALLBACK23-NEXT: movsbl %dl, %edi -; FALLBACK23-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK23-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK23-NEXT: shldl %cl, %esi, %edx -; FALLBACK23-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK23-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK23-NEXT: shldl %cl, %edi, %esi -; FALLBACK23-NEXT: shlxl %ecx, %ebx, %ebp -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: shldl %cl, %ebx, %edi -; FALLBACK23-NEXT: movl %edi, 4(%eax) -; FALLBACK23-NEXT: movl %esi, 8(%eax) -; FALLBACK23-NEXT: movl %edx, 12(%eax) -; FALLBACK23-NEXT: movl %ebp, (%eax) -; FALLBACK23-NEXT: addl $44, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: shl_16bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $60, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK24-NEXT: movzbl (%eax), %ecx -; FALLBACK24-NEXT: movl %ecx, %eax -; FALLBACK24-NEXT: shlb $3, %al -; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: andb $12, %cl -; FALLBACK24-NEXT: negb %cl -; FALLBACK24-NEXT: movsbl %cl, %edi -; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: movl 40(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %esi -; FALLBACK24-NEXT: shrl %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: movl 32(%esp,%edi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, %edi -; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %ebp, %edi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: movl %eax, (%edx) -; FALLBACK24-NEXT: movl %ebp, 4(%edx) -; FALLBACK24-NEXT: movl %edi, 8(%edx) -; FALLBACK24-NEXT: movl %esi, 12(%edx) -; FALLBACK24-NEXT: addl $60, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: shl_16bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $44, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK25-NEXT: vmovups (%edx), %xmm0 -; FALLBACK25-NEXT: movzbl (%ecx), %edx -; FALLBACK25-NEXT: movl %edx, %ecx -; FALLBACK25-NEXT: shlb $3, %cl -; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK25-NEXT: vmovaps %xmm1, (%esp) -; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: andb $12, %dl -; FALLBACK25-NEXT: negb %dl -; FALLBACK25-NEXT: movsbl %dl, %edi -; FALLBACK25-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK25-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK25-NEXT: shldl %cl, %esi, %edx -; FALLBACK25-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK25-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK25-NEXT: shldl %cl, %edi, %esi -; FALLBACK25-NEXT: movl %ebx, %ebp -; FALLBACK25-NEXT: shll %cl, %ebp -; FALLBACK25-NEXT: shldl %cl, %ebx, %edi -; FALLBACK25-NEXT: movl %edi, 4(%eax) -; FALLBACK25-NEXT: movl %esi, 8(%eax) -; FALLBACK25-NEXT: movl %edx, 12(%eax) -; FALLBACK25-NEXT: movl %ebp, (%eax) -; FALLBACK25-NEXT: addl $44, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: shl_16bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $44, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %edx -; FALLBACK26-NEXT: movl %edx, %eax -; FALLBACK26-NEXT: shlb $3, %al -; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK26-NEXT: vmovaps %xmm1, (%esp) -; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: andb $12, %dl -; FALLBACK26-NEXT: negb %dl -; FALLBACK26-NEXT: movsbl %dl, %edx -; FALLBACK26-NEXT: shlxl %ecx, 28(%esp,%edx), %edi -; FALLBACK26-NEXT: notb %al -; FALLBACK26-NEXT: movl 24(%esp,%edx), %esi -; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebx -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %eax, %esi, %esi -; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: movl 20(%esp,%edx), %edi -; FALLBACK26-NEXT: movl %edi, %ebp -; FALLBACK26-NEXT: shrl %ebp -; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK26-NEXT: orl %ebx, %ebp -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: movl 16(%esp,%edx), %edx -; FALLBACK26-NEXT: shlxl %ecx, %edx, %ecx -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %eax, %edx, %eax -; FALLBACK26-NEXT: orl %edi, %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %ecx, (%edx) -; FALLBACK26-NEXT: movl %eax, 4(%edx) -; FALLBACK26-NEXT: movl %ebp, 8(%edx) -; FALLBACK26-NEXT: movl %esi, 12(%edx) -; FALLBACK26-NEXT: addl $44, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: shl_16bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $44, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK27-NEXT: vmovups (%edx), %xmm0 -; FALLBACK27-NEXT: movzbl (%ecx), %edx -; FALLBACK27-NEXT: movl %edx, %ecx -; FALLBACK27-NEXT: shlb $3, %cl -; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK27-NEXT: vmovaps %xmm1, (%esp) -; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: andb $12, %dl -; FALLBACK27-NEXT: negb %dl -; FALLBACK27-NEXT: movsbl %dl, %edi -; FALLBACK27-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK27-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK27-NEXT: shldl %cl, %esi, %edx -; FALLBACK27-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK27-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK27-NEXT: shldl %cl, %edi, %esi -; FALLBACK27-NEXT: shlxl %ecx, %ebx, %ebp -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: shldl %cl, %ebx, %edi -; FALLBACK27-NEXT: movl %edi, 4(%eax) -; FALLBACK27-NEXT: movl %esi, 8(%eax) -; FALLBACK27-NEXT: movl %edx, 12(%eax) -; FALLBACK27-NEXT: movl %ebp, (%eax) -; FALLBACK27-NEXT: addl $44, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: shl_16bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $60, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK28-NEXT: movzbl (%eax), %ecx -; FALLBACK28-NEXT: movl %ecx, %eax -; FALLBACK28-NEXT: shlb $3, %al -; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: andb $12, %cl -; FALLBACK28-NEXT: negb %cl -; FALLBACK28-NEXT: movsbl %cl, %edi -; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: movl 40(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %esi -; FALLBACK28-NEXT: shrl %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: movl 32(%esp,%edi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, %edi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %ebp, %edi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: movl %eax, (%edx) -; FALLBACK28-NEXT: movl %ebp, 4(%edx) -; FALLBACK28-NEXT: movl %edi, 8(%edx) -; FALLBACK28-NEXT: movl %esi, 12(%edx) -; FALLBACK28-NEXT: addl $60, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: shl_16bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $44, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK29-NEXT: vmovups (%edx), %xmm0 -; FALLBACK29-NEXT: movzbl (%ecx), %edx -; FALLBACK29-NEXT: movl %edx, %ecx -; FALLBACK29-NEXT: shlb $3, %cl -; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK29-NEXT: vmovaps %xmm1, (%esp) -; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: andb $12, %dl -; FALLBACK29-NEXT: negb %dl -; FALLBACK29-NEXT: movsbl %dl, %edi -; FALLBACK29-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK29-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK29-NEXT: shldl %cl, %esi, %edx -; FALLBACK29-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK29-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK29-NEXT: shldl %cl, %edi, %esi -; FALLBACK29-NEXT: movl %ebx, %ebp -; FALLBACK29-NEXT: shll %cl, %ebp -; FALLBACK29-NEXT: shldl %cl, %ebx, %edi -; FALLBACK29-NEXT: movl %edi, 4(%eax) -; FALLBACK29-NEXT: movl %esi, 8(%eax) -; FALLBACK29-NEXT: movl %edx, 12(%eax) -; FALLBACK29-NEXT: movl %ebp, (%eax) -; FALLBACK29-NEXT: addl $44, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: shl_16bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $44, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %edx -; FALLBACK30-NEXT: movl %edx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK30-NEXT: vmovaps %xmm1, (%esp) -; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: andb $12, %dl -; FALLBACK30-NEXT: negb %dl -; FALLBACK30-NEXT: movsbl %dl, %edx -; FALLBACK30-NEXT: shlxl %ecx, 28(%esp,%edx), %edi -; FALLBACK30-NEXT: notb %al -; FALLBACK30-NEXT: movl 24(%esp,%edx), %esi -; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebx -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %eax, %esi, %esi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: movl 20(%esp,%edx), %edi -; FALLBACK30-NEXT: movl %edi, %ebp -; FALLBACK30-NEXT: shrl %ebp -; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK30-NEXT: orl %ebx, %ebp -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: movl 16(%esp,%edx), %edx -; FALLBACK30-NEXT: shlxl %ecx, %edx, %ecx -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %eax, %edx, %eax -; FALLBACK30-NEXT: orl %edi, %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %ecx, (%edx) -; FALLBACK30-NEXT: movl %eax, 4(%edx) -; FALLBACK30-NEXT: movl %ebp, 8(%edx) -; FALLBACK30-NEXT: movl %esi, 12(%edx) -; FALLBACK30-NEXT: addl $44, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: shl_16bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $44, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK31-NEXT: vmovups (%edx), %xmm0 -; FALLBACK31-NEXT: movzbl (%ecx), %edx -; FALLBACK31-NEXT: movl %edx, %ecx -; FALLBACK31-NEXT: shlb $3, %cl -; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK31-NEXT: vmovaps %xmm1, (%esp) -; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: andb $12, %dl -; FALLBACK31-NEXT: negb %dl -; FALLBACK31-NEXT: movsbl %dl, %edi -; FALLBACK31-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK31-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK31-NEXT: shldl %cl, %esi, %edx -; FALLBACK31-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK31-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK31-NEXT: shldl %cl, %edi, %esi -; FALLBACK31-NEXT: shlxl %ecx, %ebx, %ebp -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: shldl %cl, %ebx, %edi -; FALLBACK31-NEXT: movl %edi, 4(%eax) -; FALLBACK31-NEXT: movl %esi, 8(%eax) -; FALLBACK31-NEXT: movl %edx, 12(%eax) -; FALLBACK31-NEXT: movl %ebp, (%eax) -; FALLBACK31-NEXT: addl $44, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: retl +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_16bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %dh +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %dh +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ah, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %dl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_16bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $32, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%edx), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%edx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%edx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ch, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%edi), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%edi), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%edi), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%edi), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 12(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $32, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_16bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %bl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %bl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %bl, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%esi), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, (%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 12(%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 4(%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_16bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%edx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%edx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%edx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %al, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esp,%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edi, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_16bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 4(%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 8(%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 12(%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_16bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%edx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, (%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %dl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %dl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbl %dl, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%esp,%edi), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 12(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, (%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_16bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, (%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %dl, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%esp,%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%esp,%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%esp,%edx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_16bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%edx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %dl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %dl, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%esp,%edi), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %ebx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX-LABEL: shl_16bytes: +; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $60, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $12, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movsbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 4(%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 8(%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 12(%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $60, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_16bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%edx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, (%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $12, %dl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %dl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbl %dl, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 24(%esp,%edi), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 28(%esp,%edi), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 16(%esp,%edi), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 20(%esp,%edi), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %ebx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 12(%eax) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, (%eax) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_16bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, (%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %dl, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%esp,%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %esi, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%esp,%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%esp,%edx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, (%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_16bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%edx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %dl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %dl, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%esp,%edi), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%esp,%edi), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%esp,%edi), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%esp,%edi), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %ebx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %ebx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %byteOff = load i128, ptr %byteOff.ptr, align 1 %bitOff = shl i128 %byteOff, 3 @@ -3111,1932 +2728,1477 @@ define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no } define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: lshr_32bytes: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: leal (,%rsi,8), %eax -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: andb $24, %sil -; FALLBACK0-NEXT: movzbl %sil, %r9d -; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r11, %r8 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %rdi, %rdi -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r9 -; FALLBACK0-NEXT: movq %r9, 24(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) -; FALLBACK0-NEXT: movq %rdi, (%rdx) -; FALLBACK0-NEXT: movq %r8, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: lshr_32bytes: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %rdi -; FALLBACK1-NEXT: movzbl (%rsi), %esi -; FALLBACK1-NEXT: leal (,%rsi,8), %ecx -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: andb $24, %sil -; FALLBACK1-NEXT: movzbl %sil, %eax -; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi -; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8 -; FALLBACK1-NEXT: movq %r8, %r9 -; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: shrq %cl, %rax -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rax, 24(%rdx) -; FALLBACK1-NEXT: movq %rdi, (%rdx) -; FALLBACK1-NEXT: movq %r9, 8(%rdx) -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: lshr_32bytes: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: leal (,%rsi,8), %eax -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movl %eax, %ecx -; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %esi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 -; FALLBACK2-NEXT: notb %al -; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r9, %rdi -; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rax -; FALLBACK2-NEXT: orq %r8, %rax -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx -; FALLBACK2-NEXT: movq %rcx, 24(%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: lshr_32bytes: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: leal (,%rsi,8), %ecx -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: andb $24, %sil -; FALLBACK3-NEXT: movzbl %sil, %eax -; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi -; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8 -; FALLBACK3-NEXT: movq %r8, %r9 -; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rax, 24(%rdx) -; FALLBACK3-NEXT: movq %rdi, (%rdx) -; FALLBACK3-NEXT: movq %r9, 8(%rdx) -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: lshr_32bytes: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movzbl (%rsi), %ecx -; FALLBACK4-NEXT: leal (,%rcx,8), %eax -; FALLBACK4-NEXT: xorps %xmm2, %xmm2 -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: andb $24, %cl -; FALLBACK4-NEXT: movzbl %cl, %r9d -; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK4-NEXT: movq %r10, %r11 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r11, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: addq %r10, %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r9 -; FALLBACK4-NEXT: movq %r9, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) -; FALLBACK4-NEXT: movq %rdi, (%rdx) -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: lshr_32bytes: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movzbl (%rsi), %eax -; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: xorps %xmm2, %xmm2 -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: andb $24, %al -; FALLBACK5-NEXT: movzbl %al, %eax -; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK5-NEXT: movq %rdi, %r8 -; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK5-NEXT: movq %rax, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK5-NEXT: shrq %cl, %rsi -; FALLBACK5-NEXT: movq %r10, 8(%rdx) -; FALLBACK5-NEXT: movq %r8, 16(%rdx) -; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: lshr_32bytes: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: leal (,%rcx,8), %eax -; FALLBACK6-NEXT: xorps %xmm2, %xmm2 -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movl %eax, %esi -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK6-NEXT: orq %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 -; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 -; FALLBACK6-NEXT: orq %rdi, %r11 -; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi -; FALLBACK6-NEXT: addq %r9, %r9 -; FALLBACK6-NEXT: shlxq %rax, %r9, %rax -; FALLBACK6-NEXT: orq %rdi, %rax -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx -; FALLBACK6-NEXT: movq %rcx, 24(%rdx) -; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %r11, 16(%rdx) -; FALLBACK6-NEXT: movq %r10, (%rdx) -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: lshr_32bytes: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movzbl (%rsi), %eax -; FALLBACK7-NEXT: leal (,%rax,8), %ecx -; FALLBACK7-NEXT: xorps %xmm2, %xmm2 -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: andb $24, %al -; FALLBACK7-NEXT: movzbl %al, %eax -; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK7-NEXT: movq %rdi, %r8 -; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK7-NEXT: movq %rax, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax -; FALLBACK7-NEXT: movq %r10, 8(%rdx) -; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rax, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: lshr_32bytes: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: movzbl (%rsi), %ecx -; FALLBACK8-NEXT: leal (,%rcx,8), %eax -; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: andb $24, %cl -; FALLBACK8-NEXT: movzbl %cl, %r9d -; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq %r10, %r11 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r11, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %r10, %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) -; FALLBACK8-NEXT: movq %rdi, (%rdx) -; FALLBACK8-NEXT: popq %rbx -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: lshr_32bytes: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: movzbl (%rsi), %eax -; FALLBACK9-NEXT: leal (,%rax,8), %ecx -; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: andb $24, %al -; FALLBACK9-NEXT: movzbl %al, %eax -; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq %rdi, %r8 -; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK9-NEXT: shrq %cl, %rsi -; FALLBACK9-NEXT: movq %r10, 8(%rdx) -; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: lshr_32bytes: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %eax -; FALLBACK10-NEXT: leal (,%rax,8), %ecx -; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movl %ecx, %esi -; FALLBACK10-NEXT: andb $24, %al -; FALLBACK10-NEXT: movzbl %al, %eax -; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi -; FALLBACK10-NEXT: notb %cl -; FALLBACK10-NEXT: movq -64(%rsp,%rax), %r8 -; FALLBACK10-NEXT: movq -56(%rsp,%rax), %r9 -; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 -; FALLBACK10-NEXT: orq %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi -; FALLBACK10-NEXT: movq -48(%rsp,%rax), %rax -; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 -; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 -; FALLBACK10-NEXT: orq %rdi, %r11 -; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi -; FALLBACK10-NEXT: addq %r9, %r9 -; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx -; FALLBACK10-NEXT: orq %rdi, %rcx -; FALLBACK10-NEXT: shrxq %rsi, %rax, %rax -; FALLBACK10-NEXT: movq %rax, 24(%rdx) -; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %r11, 16(%rdx) -; FALLBACK10-NEXT: movq %r10, (%rdx) -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: lshr_32bytes: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: movzbl (%rsi), %eax -; FALLBACK11-NEXT: leal (,%rax,8), %ecx -; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: andb $24, %al -; FALLBACK11-NEXT: movzbl %al, %eax -; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK11-NEXT: movq %rdi, %r8 -; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax -; FALLBACK11-NEXT: movq %r10, 8(%rdx) -; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rax, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: lshr_32bytes: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK12-NEXT: movzbl (%rsi), %ecx -; FALLBACK12-NEXT: leal (,%rcx,8), %eax -; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: andb $24, %cl -; FALLBACK12-NEXT: movzbl %cl, %r9d -; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %r11 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r11, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %r10, %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) -; FALLBACK12-NEXT: movq %rdi, (%rdx) -; FALLBACK12-NEXT: popq %rbx -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: lshr_32bytes: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK13-NEXT: movzbl (%rsi), %eax -; FALLBACK13-NEXT: leal (,%rax,8), %ecx -; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: andb $24, %al -; FALLBACK13-NEXT: movzbl %al, %eax -; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK13-NEXT: movq %rdi, %r8 -; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK13-NEXT: movq %rax, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: shrq %cl, %rsi -; FALLBACK13-NEXT: movq %r10, 8(%rdx) -; FALLBACK13-NEXT: movq %r8, 16(%rdx) -; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) -; FALLBACK13-NEXT: vzeroupper -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: lshr_32bytes: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %eax -; FALLBACK14-NEXT: leal (,%rax,8), %ecx -; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movl %ecx, %esi -; FALLBACK14-NEXT: andb $24, %al -; FALLBACK14-NEXT: movzbl %al, %eax -; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi -; FALLBACK14-NEXT: notb %cl -; FALLBACK14-NEXT: movq -64(%rsp,%rax), %r8 -; FALLBACK14-NEXT: movq -56(%rsp,%rax), %r9 -; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 -; FALLBACK14-NEXT: orq %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi -; FALLBACK14-NEXT: movq -48(%rsp,%rax), %rax -; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 -; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 -; FALLBACK14-NEXT: orq %rdi, %r11 -; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi -; FALLBACK14-NEXT: addq %r9, %r9 -; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx -; FALLBACK14-NEXT: orq %rdi, %rcx -; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax -; FALLBACK14-NEXT: movq %rax, 24(%rdx) -; FALLBACK14-NEXT: movq %rcx, 8(%rdx) -; FALLBACK14-NEXT: movq %r11, 16(%rdx) -; FALLBACK14-NEXT: movq %r10, (%rdx) -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: lshr_32bytes: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK15-NEXT: movzbl (%rsi), %eax -; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: andb $24, %al -; FALLBACK15-NEXT: movzbl %al, %eax -; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq %rdi, %r8 -; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax -; FALLBACK15-NEXT: movq %r10, 8(%rdx) -; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rax, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq -; -; FALLBACK16-LABEL: lshr_32bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $108, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK16-NEXT: movl (%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%ebp), %edi -; FALLBACK16-NEXT: movl 16(%ebp), %ebx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movl 20(%ebp), %esi -; FALLBACK16-NEXT: movl 24(%ebp), %ecx -; FALLBACK16-NEXT: movl 28(%ebp), %ebp -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movb %ah, %dh -; FALLBACK16-NEXT: shlb $3, %dh -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $28, %ah -; FALLBACK16-NEXT: movzbl %ah, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax -; FALLBACK16-NEXT: movl %eax, %ebx -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movb %dh, %dl -; FALLBACK16-NEXT: notb %dl -; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: movl %eax, %ebx -; FALLBACK16-NEXT: addl %eax, %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %esi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp -; FALLBACK16-NEXT: movl %ebp, %esi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: movl 48(%esp,%eax), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%eax,%eax), %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %esi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebp, %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %edi, %ebp -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl 52(%esp,%eax), %edi -; FALLBACK16-NEXT: movl %edi, %ebx -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl 56(%esp,%eax), %esi -; FALLBACK16-NEXT: leal (%esi,%esi), %eax -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebx, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %edi, %edi -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: movl %esi, %eax -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %eax, %esi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl %ebx, 28(%eax) -; FALLBACK16-NEXT: movl %esi, 24(%eax) -; FALLBACK16-NEXT: movl %edi, 16(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 20(%eax) -; FALLBACK16-NEXT: movl %ebp, 8(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 12(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, (%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $108, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: lshr_32bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $92, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl (%ebp), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%ebp), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%ebp), %esi -; FALLBACK17-NEXT: movl 12(%ebp), %edi -; FALLBACK17-NEXT: movl 16(%ebp), %ebx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movl 20(%ebp), %edx -; FALLBACK17-NEXT: movl 24(%ebp), %eax -; FALLBACK17-NEXT: movl 28(%ebp), %ebp -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movb %ch, %cl -; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: andb $28, %ch -; FALLBACK17-NEXT: movzbl %ch, %ebp -; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %edx, %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %esi -; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %edx, %edi -; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 24(%ebp) -; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK17-NEXT: shrdl %cl, %edx, %esi -; FALLBACK17-NEXT: shrl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl %ebx, 16(%ebp) -; FALLBACK17-NEXT: movl %edi, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %esi, (%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) -; FALLBACK17-NEXT: addl $92, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: popl %ebp -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: lshr_32bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %esi -; FALLBACK18-NEXT: movl 12(%eax), %edi -; FALLBACK18-NEXT: movl 16(%eax), %ebp -; FALLBACK18-NEXT: movzbl (%ebx), %ebx -; FALLBACK18-NEXT: movl 20(%eax), %edx -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl 28(%eax), %eax -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, %ecx -; FALLBACK18-NEXT: shlb $3, %cl -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: andb $28, %bl -; FALLBACK18-NEXT: movzbl %bl, %esi -; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx -; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp -; FALLBACK18-NEXT: shrxl %eax, %edx, %edi -; FALLBACK18-NEXT: notb %cl -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx -; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK18-NEXT: orl %edi, %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %edi -; FALLBACK18-NEXT: addl %edx, %edx -; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK18-NEXT: orl %edi, %edx -; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx -; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: addl %edx, %edx -; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx -; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx -; FALLBACK18-NEXT: shrxl %eax, %edx, %edi -; FALLBACK18-NEXT: orl %edi, %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi -; FALLBACK18-NEXT: movl %eax, %ebp -; FALLBACK18-NEXT: addl %edx, %edx -; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi -; FALLBACK18-NEXT: leal (%edi,%edi), %edx -; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax -; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx -; FALLBACK18-NEXT: orl %ebx, %edx -; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi -; FALLBACK18-NEXT: leal (%esi,%esi), %ebx -; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx -; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: shrxl %ebp, %esi, %esi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edi -; FALLBACK18-NEXT: movl %esi, 28(%edi) -; FALLBACK18-NEXT: movl %ecx, 24(%edi) -; FALLBACK18-NEXT: movl %eax, 16(%edi) -; FALLBACK18-NEXT: movl %edx, 20(%edi) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 8(%edi) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 12(%edi) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, (%edi) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 4(%edi) -; FALLBACK18-NEXT: addl $108, %esp -; FALLBACK18-NEXT: popl %esi -; FALLBACK18-NEXT: popl %edi -; FALLBACK18-NEXT: popl %ebx -; FALLBACK18-NEXT: popl %ebp -; FALLBACK18-NEXT: retl -; -; FALLBACK19-LABEL: lshr_32bytes: -; FALLBACK19: # %bb.0: -; FALLBACK19-NEXT: pushl %ebp -; FALLBACK19-NEXT: pushl %ebx -; FALLBACK19-NEXT: pushl %edi -; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $92, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ecx), %esi -; FALLBACK19-NEXT: movl 12(%ecx), %edi -; FALLBACK19-NEXT: movl 16(%ecx), %ebp -; FALLBACK19-NEXT: movzbl (%ebx), %ebx -; FALLBACK19-NEXT: movl 20(%ecx), %edx -; FALLBACK19-NEXT: movl 24(%ecx), %eax -; FALLBACK19-NEXT: movl 28(%ecx), %ecx -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, %ecx -; FALLBACK19-NEXT: shlb $3, %cl -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: andb $28, %bl -; FALLBACK19-NEXT: movzbl %bl, %ebp -; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi -; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %esi, %eax -; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl %edx, %esi -; FALLBACK19-NEXT: shrdl %cl, %eax, %esi -; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi -; FALLBACK19-NEXT: shrdl %cl, %edi, %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl %eax, 24(%ebp) -; FALLBACK19-NEXT: shrxl %ecx, %edi, %eax -; FALLBACK19-NEXT: movl %eax, 28(%ebp) -; FALLBACK19-NEXT: movl %ebx, 16(%ebp) -; FALLBACK19-NEXT: movl %esi, 20(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 8(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 12(%ebp) -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: shrdl %cl, %eax, %edx -; FALLBACK19-NEXT: movl %edx, (%ebp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 4(%ebp) -; FALLBACK19-NEXT: addl $92, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: lshr_32bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $108, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK20-NEXT: movzbl (%eax), %ecx -; FALLBACK20-NEXT: movl %ecx, %eax -; FALLBACK20-NEXT: shlb $3, %al -; FALLBACK20-NEXT: xorps %xmm2, %xmm2 -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: andb $28, %cl -; FALLBACK20-NEXT: movzbl %cl, %edi -; FALLBACK20-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %esi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %esi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %esi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, %ebx -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %ebp, %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %ebx -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK20-NEXT: movl %ecx, (%esp) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebp, %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %edi, %ebp -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl (%esp), %edi # 4-byte Folded Reload -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl %ebx, 28(%eax) -; FALLBACK20-NEXT: movl %esi, 4(%eax) -; FALLBACK20-NEXT: movl %edi, 24(%eax) -; FALLBACK20-NEXT: movl %ebp, 16(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 20(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 8(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 12(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, (%eax) -; FALLBACK20-NEXT: addl $108, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: lshr_32bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $108, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movups (%ecx), %xmm0 -; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK21-NEXT: movzbl (%eax), %eax -; FALLBACK21-NEXT: movl %eax, %ecx -; FALLBACK21-NEXT: shlb $3, %cl -; FALLBACK21-NEXT: xorps %xmm2, %xmm2 -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: andb $28, %al -; FALLBACK21-NEXT: movzbl %al, %ebp -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl %edi, %esi -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %esi, 4(%ebp) -; FALLBACK21-NEXT: movl %ebx, 24(%ebp) -; FALLBACK21-NEXT: shrdl %cl, %edi, %edx -; FALLBACK21-NEXT: shrl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 28(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %edx, (%ebp) -; FALLBACK21-NEXT: addl $108, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: lshr_32bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $108, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %edx -; FALLBACK22-NEXT: movl %edx, %ecx -; FALLBACK22-NEXT: shlb $3, %cl -; FALLBACK22-NEXT: xorps %xmm2, %xmm2 -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: andb $28, %dl -; FALLBACK22-NEXT: movzbl %dl, %ebx -; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ebx), %edx -; FALLBACK22-NEXT: movl %eax, %edi -; FALLBACK22-NEXT: notb %cl -; FALLBACK22-NEXT: movl 36(%esp,%ebx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %esi -; FALLBACK22-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK22-NEXT: orl %edx, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%ebx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %edx -; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax -; FALLBACK22-NEXT: movl 44(%esp,%ebx), %edx -; FALLBACK22-NEXT: shrxl %edi, %edx, %esi -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %edx, %edx -; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax -; FALLBACK22-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edi, %edx, %esi -; FALLBACK22-NEXT: movl %edi, %edx -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%ebx), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebp -; FALLBACK22-NEXT: shlxl %ecx, %ebp, %ebp -; FALLBACK22-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK22-NEXT: shrxl %edi, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ecx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %esi, %eax -; FALLBACK22-NEXT: movl 60(%esp,%ebx), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ecx, %eax, %eax -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK22-NEXT: orl %ecx, %eax -; FALLBACK22-NEXT: shrxl %edx, %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %ecx, 28(%edx) -; FALLBACK22-NEXT: movl %eax, 4(%edx) -; FALLBACK22-NEXT: movl %ebx, 24(%edx) -; FALLBACK22-NEXT: movl %edi, 16(%edx) -; FALLBACK22-NEXT: movl %ebp, 20(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 8(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, (%edx) -; FALLBACK22-NEXT: addl $108, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: lshr_32bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $108, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movups (%ecx), %xmm0 -; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK23-NEXT: movzbl (%eax), %eax -; FALLBACK23-NEXT: movl %eax, %ecx -; FALLBACK23-NEXT: shlb $3, %cl -; FALLBACK23-NEXT: xorps %xmm2, %xmm2 -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: andb $28, %al -; FALLBACK23-NEXT: movzbl %al, %ebx -; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %edi -; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi -; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl %ebx, 4(%eax) -; FALLBACK23-NEXT: movl %ebp, 24(%eax) -; FALLBACK23-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK23-NEXT: movl %ebx, 28(%eax) -; FALLBACK23-NEXT: movl %esi, 16(%eax) -; FALLBACK23-NEXT: movl %edi, 20(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: movl %esi, 8(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: movl %esi, 12(%eax) -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, (%eax) -; FALLBACK23-NEXT: addl $108, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: lshr_32bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $108, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK24-NEXT: movzbl (%eax), %ecx -; FALLBACK24-NEXT: movl %ecx, %eax -; FALLBACK24-NEXT: shlb $3, %al -; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: andb $28, %cl -; FALLBACK24-NEXT: movzbl %cl, %edi -; FALLBACK24-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %esi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %esi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %esi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, %ebx -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %ebp, %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %ebx -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK24-NEXT: movl %ecx, (%esp) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebp, %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %edi, %ebp -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl (%esp), %edi # 4-byte Folded Reload -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl %ebx, 28(%eax) -; FALLBACK24-NEXT: movl %esi, 4(%eax) -; FALLBACK24-NEXT: movl %edi, 24(%eax) -; FALLBACK24-NEXT: movl %ebp, 16(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 20(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 8(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 12(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, (%eax) -; FALLBACK24-NEXT: addl $108, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: vzeroupper -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: lshr_32bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $108, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK25-NEXT: movzbl (%eax), %eax -; FALLBACK25-NEXT: movl %eax, %ecx -; FALLBACK25-NEXT: shlb $3, %cl -; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: andb $28, %al -; FALLBACK25-NEXT: movzbl %al, %ebp -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl %edi, %esi -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %esi, 4(%ebp) -; FALLBACK25-NEXT: movl %ebx, 24(%ebp) -; FALLBACK25-NEXT: shrdl %cl, %edi, %edx -; FALLBACK25-NEXT: shrl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %edx, (%ebp) -; FALLBACK25-NEXT: addl $108, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: vzeroupper -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: lshr_32bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $108, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %edx -; FALLBACK26-NEXT: movl %edx, %ecx -; FALLBACK26-NEXT: shlb $3, %cl -; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: andb $28, %dl -; FALLBACK26-NEXT: movzbl %dl, %ebx -; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ebx), %edx -; FALLBACK26-NEXT: movl %eax, %edi -; FALLBACK26-NEXT: notb %cl -; FALLBACK26-NEXT: movl 36(%esp,%ebx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK26-NEXT: orl %edx, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%ebx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %edx -; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax -; FALLBACK26-NEXT: movl 44(%esp,%ebx), %edx -; FALLBACK26-NEXT: shrxl %edi, %edx, %esi -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %edx, %edx -; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax -; FALLBACK26-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edi, %edx, %esi -; FALLBACK26-NEXT: movl %edi, %edx -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%ebx), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebp -; FALLBACK26-NEXT: shlxl %ecx, %ebp, %ebp -; FALLBACK26-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK26-NEXT: shrxl %edi, %eax, %edi -; FALLBACK26-NEXT: orl %edi, %ebp -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ecx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %esi, %eax -; FALLBACK26-NEXT: movl 60(%esp,%ebx), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK26-NEXT: orl %eax, %ebx -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ecx, %eax, %eax -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK26-NEXT: orl %ecx, %eax -; FALLBACK26-NEXT: shrxl %edx, %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %ecx, 28(%edx) -; FALLBACK26-NEXT: movl %eax, 4(%edx) -; FALLBACK26-NEXT: movl %ebx, 24(%edx) -; FALLBACK26-NEXT: movl %edi, 16(%edx) -; FALLBACK26-NEXT: movl %ebp, 20(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 8(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, (%edx) -; FALLBACK26-NEXT: addl $108, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: vzeroupper -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: lshr_32bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $108, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK27-NEXT: movzbl (%eax), %eax -; FALLBACK27-NEXT: movl %eax, %ecx -; FALLBACK27-NEXT: shlb $3, %cl -; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: andb $28, %al -; FALLBACK27-NEXT: movzbl %al, %ebx -; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %edi -; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi -; FALLBACK27-NEXT: shrdl %cl, %eax, %esi -; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl %ebx, 4(%eax) -; FALLBACK27-NEXT: movl %ebp, 24(%eax) -; FALLBACK27-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK27-NEXT: movl %ebx, 28(%eax) -; FALLBACK27-NEXT: movl %esi, 16(%eax) -; FALLBACK27-NEXT: movl %edi, 20(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: movl %esi, 8(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: movl %esi, 12(%eax) -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, (%eax) -; FALLBACK27-NEXT: addl $108, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: vzeroupper -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: lshr_32bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $108, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK28-NEXT: movzbl (%eax), %ecx -; FALLBACK28-NEXT: movl %ecx, %eax -; FALLBACK28-NEXT: shlb $3, %al -; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: andb $28, %cl -; FALLBACK28-NEXT: movzbl %cl, %edi -; FALLBACK28-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %esi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %esi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %esi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, %ebx -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %ebp, %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %ebx -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK28-NEXT: movl %ecx, (%esp) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebp, %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %edi, %ebp -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl (%esp), %edi # 4-byte Folded Reload -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl %ebx, 28(%eax) -; FALLBACK28-NEXT: movl %esi, 4(%eax) -; FALLBACK28-NEXT: movl %edi, 24(%eax) -; FALLBACK28-NEXT: movl %ebp, 16(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 20(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 8(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 12(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, (%eax) -; FALLBACK28-NEXT: addl $108, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: vzeroupper -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: lshr_32bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $108, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK29-NEXT: movzbl (%eax), %eax -; FALLBACK29-NEXT: movl %eax, %ecx -; FALLBACK29-NEXT: shlb $3, %cl -; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: andb $28, %al -; FALLBACK29-NEXT: movzbl %al, %ebp -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl %edi, %esi -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %esi, 4(%ebp) -; FALLBACK29-NEXT: movl %ebx, 24(%ebp) -; FALLBACK29-NEXT: shrdl %cl, %edi, %edx -; FALLBACK29-NEXT: shrl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %edx, (%ebp) -; FALLBACK29-NEXT: addl $108, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: vzeroupper -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: lshr_32bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $108, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %edx -; FALLBACK30-NEXT: movl %edx, %ecx -; FALLBACK30-NEXT: shlb $3, %cl -; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: andb $28, %dl -; FALLBACK30-NEXT: movzbl %dl, %ebx -; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ebx), %edx -; FALLBACK30-NEXT: movl %eax, %edi -; FALLBACK30-NEXT: notb %cl -; FALLBACK30-NEXT: movl 36(%esp,%ebx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK30-NEXT: orl %edx, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%ebx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %edx -; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax -; FALLBACK30-NEXT: movl 44(%esp,%ebx), %edx -; FALLBACK30-NEXT: shrxl %edi, %edx, %esi -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %edx, %edx -; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax -; FALLBACK30-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edi, %edx, %esi -; FALLBACK30-NEXT: movl %edi, %edx -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%ebx), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebp -; FALLBACK30-NEXT: shlxl %ecx, %ebp, %ebp -; FALLBACK30-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK30-NEXT: shrxl %edi, %eax, %edi -; FALLBACK30-NEXT: orl %edi, %ebp -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ecx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %edx, %esi, %eax -; FALLBACK30-NEXT: movl 60(%esp,%ebx), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK30-NEXT: orl %eax, %ebx -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ecx, %eax, %eax -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK30-NEXT: orl %ecx, %eax -; FALLBACK30-NEXT: shrxl %edx, %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %ecx, 28(%edx) -; FALLBACK30-NEXT: movl %eax, 4(%edx) -; FALLBACK30-NEXT: movl %ebx, 24(%edx) -; FALLBACK30-NEXT: movl %edi, 16(%edx) -; FALLBACK30-NEXT: movl %ebp, 20(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 8(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, (%edx) -; FALLBACK30-NEXT: addl $108, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: vzeroupper -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: lshr_32bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $108, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK31-NEXT: movzbl (%eax), %eax -; FALLBACK31-NEXT: movl %eax, %ecx -; FALLBACK31-NEXT: shlb $3, %cl -; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: andb $28, %al -; FALLBACK31-NEXT: movzbl %al, %ebx -; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %edi -; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi -; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl %ebx, 4(%eax) -; FALLBACK31-NEXT: movl %ebp, 24(%eax) -; FALLBACK31-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK31-NEXT: movl %ebx, 28(%eax) -; FALLBACK31-NEXT: movl %esi, 16(%eax) -; FALLBACK31-NEXT: movl %edi, 20(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: movl %esi, 8(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: movl %esi, 12(%eax) -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, (%eax) -; FALLBACK31-NEXT: addl $108, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: vzeroupper -; FALLBACK31-NEXT: retl +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %r9d +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %cl +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %cl, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -72(%rsp,%rcx), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rcx), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rcx), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rcx), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rcx,%rcx), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rcx, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes: +; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rcx,8), %eax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %r9d +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes: +; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes: +; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ebp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ebp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ebp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ebp), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ebp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ebp), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %dh +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %dh +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ah, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %dl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%edi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%eax), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%eax), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%eax), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%esi,%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ebp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ch, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %bl, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%ebp,%ebp), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%esi), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%esi), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 20(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %bl, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edi, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edi, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edi, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes: +; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $28, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 24(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 16(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebp, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edi, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edi, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edi, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 32(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 %bitOff = shl i256 %byteOff, 3 @@ -5046,591 +4208,452 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { } define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: lshr_32bytes_dwordOff: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: movl %esi, %eax -; FALLBACK0-NEXT: shlb $5, %al -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: andb $6, %sil -; FALLBACK0-NEXT: movzbl %sil, %r9d -; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r11, %r8 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %rdi, %rdi -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r9 -; FALLBACK0-NEXT: movq %r9, 24(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) -; FALLBACK0-NEXT: movq %rdi, (%rdx) -; FALLBACK0-NEXT: movq %r8, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: lshr_32bytes_dwordOff: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %rdi -; FALLBACK1-NEXT: movzbl (%rsi), %esi -; FALLBACK1-NEXT: movl %esi, %ecx -; FALLBACK1-NEXT: shlb $5, %cl -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: andb $6, %sil -; FALLBACK1-NEXT: movzbl %sil, %eax -; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi -; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi -; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK1-NEXT: movq %r8, %r9 -; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK1-NEXT: shrq %cl, %rax -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rax, 24(%rdx) -; FALLBACK1-NEXT: movq %rdi, (%rdx) -; FALLBACK1-NEXT: movq %r9, 8(%rdx) -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: lshr_32bytes_dwordOff: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: movl %esi, %eax -; FALLBACK2-NEXT: shlb $5, %al -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movl %eax, %ecx -; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %esi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi -; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 -; FALLBACK2-NEXT: notb %al -; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r9, %rdi -; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi -; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rax -; FALLBACK2-NEXT: orq %r8, %rax -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx -; FALLBACK2-NEXT: movq %rcx, 24(%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: lshr_32bytes_dwordOff: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: movl %esi, %ecx -; FALLBACK3-NEXT: shlb $5, %cl -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: andb $6, %sil -; FALLBACK3-NEXT: movzbl %sil, %eax -; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi -; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi -; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK3-NEXT: movq %r8, %r9 -; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rax, 24(%rdx) -; FALLBACK3-NEXT: movq %rdi, (%rdx) -; FALLBACK3-NEXT: movq %r9, 8(%rdx) -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: lshr_32bytes_dwordOff: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movzbl (%rsi), %ecx -; FALLBACK4-NEXT: movl %ecx, %eax -; FALLBACK4-NEXT: shlb $5, %al -; FALLBACK4-NEXT: xorps %xmm2, %xmm2 -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: andb $6, %cl -; FALLBACK4-NEXT: movzbl %cl, %r9d -; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK4-NEXT: movq %r10, %r11 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r11, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: addq %r10, %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r9 -; FALLBACK4-NEXT: movq %r9, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) -; FALLBACK4-NEXT: movq %rdi, (%rdx) -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: lshr_32bytes_dwordOff: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movzbl (%rsi), %eax -; FALLBACK5-NEXT: movl %eax, %ecx -; FALLBACK5-NEXT: shlb $5, %cl -; FALLBACK5-NEXT: xorps %xmm2, %xmm2 -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: andb $6, %al -; FALLBACK5-NEXT: movzbl %al, %eax -; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK5-NEXT: movq %rdi, %r8 -; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK5-NEXT: movq %rax, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK5-NEXT: shrq %cl, %rsi -; FALLBACK5-NEXT: movq %r10, 8(%rdx) -; FALLBACK5-NEXT: movq %r8, 16(%rdx) -; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: lshr_32bytes_dwordOff: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: movl %ecx, %eax -; FALLBACK6-NEXT: shlb $5, %al -; FALLBACK6-NEXT: xorps %xmm2, %xmm2 -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movl %eax, %esi -; FALLBACK6-NEXT: andb $6, %cl -; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r9 -; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK6-NEXT: orq %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 -; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 -; FALLBACK6-NEXT: orq %rdi, %r11 -; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi -; FALLBACK6-NEXT: addq %r9, %r9 -; FALLBACK6-NEXT: shlxq %rax, %r9, %rax -; FALLBACK6-NEXT: orq %rdi, %rax -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx -; FALLBACK6-NEXT: movq %rcx, 24(%rdx) -; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %r11, 16(%rdx) -; FALLBACK6-NEXT: movq %r10, (%rdx) -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: lshr_32bytes_dwordOff: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movzbl (%rsi), %eax -; FALLBACK7-NEXT: movl %eax, %ecx -; FALLBACK7-NEXT: shlb $5, %cl -; FALLBACK7-NEXT: xorps %xmm2, %xmm2 -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: andb $6, %al -; FALLBACK7-NEXT: movzbl %al, %eax -; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK7-NEXT: movq %rdi, %r8 -; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK7-NEXT: movq %rax, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax -; FALLBACK7-NEXT: movq %r10, 8(%rdx) -; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rax, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: lshr_32bytes_dwordOff: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: movzbl (%rsi), %ecx -; FALLBACK8-NEXT: movl %ecx, %eax -; FALLBACK8-NEXT: shlb $5, %al -; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: andb $6, %cl -; FALLBACK8-NEXT: movzbl %cl, %r9d -; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK8-NEXT: movq %r10, %r11 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r11, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %r10, %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) -; FALLBACK8-NEXT: movq %rdi, (%rdx) -; FALLBACK8-NEXT: popq %rbx -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: lshr_32bytes_dwordOff: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: movzbl (%rsi), %eax -; FALLBACK9-NEXT: movl %eax, %ecx -; FALLBACK9-NEXT: shlb $5, %cl -; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: andb $6, %al -; FALLBACK9-NEXT: movzbl %al, %eax -; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK9-NEXT: movq %rdi, %r8 -; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK9-NEXT: movq %rax, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK9-NEXT: shrq %cl, %rsi -; FALLBACK9-NEXT: movq %r10, 8(%rdx) -; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: lshr_32bytes_dwordOff: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %eax -; FALLBACK10-NEXT: movl %eax, %ecx -; FALLBACK10-NEXT: shlb $5, %cl -; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movl %ecx, %esi -; FALLBACK10-NEXT: andb $6, %al -; FALLBACK10-NEXT: movzbl %al, %eax -; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi -; FALLBACK10-NEXT: notb %cl -; FALLBACK10-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK10-NEXT: movq -56(%rsp,%rax,4), %r9 -; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 -; FALLBACK10-NEXT: orq %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi -; FALLBACK10-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 -; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 -; FALLBACK10-NEXT: orq %rdi, %r11 -; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi -; FALLBACK10-NEXT: addq %r9, %r9 -; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx -; FALLBACK10-NEXT: orq %rdi, %rcx -; FALLBACK10-NEXT: shrxq %rsi, %rax, %rax -; FALLBACK10-NEXT: movq %rax, 24(%rdx) -; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %r11, 16(%rdx) -; FALLBACK10-NEXT: movq %r10, (%rdx) -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: lshr_32bytes_dwordOff: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: movzbl (%rsi), %eax -; FALLBACK11-NEXT: movl %eax, %ecx -; FALLBACK11-NEXT: shlb $5, %cl -; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: andb $6, %al -; FALLBACK11-NEXT: movzbl %al, %eax -; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK11-NEXT: movq %rdi, %r8 -; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK11-NEXT: movq %rax, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax -; FALLBACK11-NEXT: movq %r10, 8(%rdx) -; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rax, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: lshr_32bytes_dwordOff: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK12-NEXT: movzbl (%rsi), %ecx -; FALLBACK12-NEXT: movl %ecx, %eax -; FALLBACK12-NEXT: shlb $5, %al -; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: andb $6, %cl -; FALLBACK12-NEXT: movzbl %cl, %r9d -; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK12-NEXT: movq %r10, %r11 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r11, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %r10, %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) -; FALLBACK12-NEXT: movq %rdi, (%rdx) -; FALLBACK12-NEXT: popq %rbx -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: lshr_32bytes_dwordOff: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK13-NEXT: movzbl (%rsi), %eax -; FALLBACK13-NEXT: movl %eax, %ecx -; FALLBACK13-NEXT: shlb $5, %cl -; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: andb $6, %al -; FALLBACK13-NEXT: movzbl %al, %eax -; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK13-NEXT: movq %rdi, %r8 -; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK13-NEXT: movq %rax, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK13-NEXT: shrq %cl, %rsi -; FALLBACK13-NEXT: movq %r10, 8(%rdx) -; FALLBACK13-NEXT: movq %r8, 16(%rdx) -; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) -; FALLBACK13-NEXT: vzeroupper -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: lshr_32bytes_dwordOff: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %eax -; FALLBACK14-NEXT: movl %eax, %ecx -; FALLBACK14-NEXT: shlb $5, %cl -; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movl %ecx, %esi -; FALLBACK14-NEXT: andb $6, %al -; FALLBACK14-NEXT: movzbl %al, %eax -; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi -; FALLBACK14-NEXT: notb %cl -; FALLBACK14-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK14-NEXT: movq -56(%rsp,%rax,4), %r9 -; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 -; FALLBACK14-NEXT: orq %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi -; FALLBACK14-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 -; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 -; FALLBACK14-NEXT: orq %rdi, %r11 -; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi -; FALLBACK14-NEXT: addq %r9, %r9 -; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx -; FALLBACK14-NEXT: orq %rdi, %rcx -; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax -; FALLBACK14-NEXT: movq %rax, 24(%rdx) -; FALLBACK14-NEXT: movq %rcx, 8(%rdx) -; FALLBACK14-NEXT: movq %r11, 16(%rdx) -; FALLBACK14-NEXT: movq %r10, (%rdx) -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: lshr_32bytes_dwordOff: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK15-NEXT: movzbl (%rsi), %eax -; FALLBACK15-NEXT: movl %eax, %ecx -; FALLBACK15-NEXT: shlb $5, %cl -; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: andb $6, %al -; FALLBACK15-NEXT: movzbl %al, %eax -; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK15-NEXT: movq %rdi, %r8 -; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK15-NEXT: movq %rax, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax -; FALLBACK15-NEXT: movq %r10, 8(%rdx) -; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rax, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9,4), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9,4), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9,4), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9,4), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %r9d +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9,4), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9,4), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9,4), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9,4), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %al +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %cl +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %cl, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -72(%rsp,%rcx,4), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rcx,4), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rcx,4), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rcx,4), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rcx,%rcx), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rcx, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %al +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $6, %cl +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %r9d +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9,4), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9,4), %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9,4), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9,4), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $6, %al +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %al +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq ; ; X86-SSE2-LABEL: lshr_32bytes_dwordOff: ; X86-SSE2: # %bb.0: @@ -5913,1953 +4936,1495 @@ define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no } define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: shl_32bytes: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: leal (,%rsi,8), %eax -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: andb $24, %sil -; FALLBACK0-NEXT: negb %sil -; FALLBACK0-NEXT: movsbq %sil, %r10 -; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8 -; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq %r8, %r9 -; FALLBACK0-NEXT: shrq %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r9 -; FALLBACK0-NEXT: orq %r11, %r9 -; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10 -; FALLBACK0-NEXT: movq %r10, %rbx -; FALLBACK0-NEXT: shrq %rbx -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: orq %r11, %rbx -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: shrq %rdi -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %rdi, 16(%rdx) -; FALLBACK0-NEXT: movq %rbx, 24(%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: shl_32bytes: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %rdi -; FALLBACK1-NEXT: movzbl (%rsi), %esi -; FALLBACK1-NEXT: leal (,%rsi,8), %ecx -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: andb $24, %sil -; FALLBACK1-NEXT: negb %sil -; FALLBACK1-NEXT: movsbq %sil, %rax -; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK1-NEXT: shldq %cl, %rax, %rsi -; FALLBACK1-NEXT: shldq %cl, %r8, %rax -; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: shlq %cl, %r8 -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rdi, 24(%rdx) -; FALLBACK1-NEXT: movq %r8, (%rdx) -; FALLBACK1-NEXT: movq %rax, 8(%rdx) -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: shl_32bytes: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: leal (,%rsi,8), %eax -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movl %eax, %ecx -; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rdi -; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 -; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi -; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 -; FALLBACK2-NEXT: notb %al -; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 -; FALLBACK2-NEXT: shrq %r8 -; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK2-NEXT: orq %r9, %r8 -; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi -; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx -; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r9, %rdi -; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax -; FALLBACK2-NEXT: orq %rcx, %rax -; FALLBACK2-NEXT: movq %r10, (%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rdi, 24(%rdx) -; FALLBACK2-NEXT: movq %r8, 8(%rdx) -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: shl_32bytes: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: leal (,%rsi,8), %ecx -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: andb $24, %sil -; FALLBACK3-NEXT: negb %sil -; FALLBACK3-NEXT: movsbq %sil, %rax -; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK3-NEXT: shldq %cl, %rax, %rsi -; FALLBACK3-NEXT: shldq %cl, %r8, %rax -; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rdi, 24(%rdx) -; FALLBACK3-NEXT: movq %rcx, (%rdx) -; FALLBACK3-NEXT: movq %rax, 8(%rdx) -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: shl_32bytes: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movzbl (%rsi), %ecx -; FALLBACK4-NEXT: leal (,%rcx,8), %eax -; FALLBACK4-NEXT: xorps %xmm2, %xmm2 -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: andb $24, %cl -; FALLBACK4-NEXT: negb %cl -; FALLBACK4-NEXT: movsbq %cl, %r8 -; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r9 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10 -; FALLBACK4-NEXT: movq %r10, %rdi -; FALLBACK4-NEXT: shrq %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %rdi -; FALLBACK4-NEXT: orq %r9, %rdi -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8 -; FALLBACK4-NEXT: movq %r8, %r11 -; FALLBACK4-NEXT: shrq %r11 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: orq %r10, %r11 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r8 -; FALLBACK4-NEXT: movq %r9, %r10 -; FALLBACK4-NEXT: shrq %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r9 -; FALLBACK4-NEXT: movq %r9, (%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %r11, 16(%rdx) -; FALLBACK4-NEXT: movq %rdi, 24(%rdx) -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: shl_32bytes: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movzbl (%rsi), %eax -; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: xorps %xmm2, %xmm2 -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: andb $24, %al -; FALLBACK5-NEXT: negb %al -; FALLBACK5-NEXT: movsbq %al, %rax -; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK5-NEXT: shldq %cl, %rax, %rsi -; FALLBACK5-NEXT: movq %r8, %r9 -; FALLBACK5-NEXT: shlq %cl, %r9 -; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK5-NEXT: shldq %cl, %r8, %rax -; FALLBACK5-NEXT: movq %rax, 8(%rdx) -; FALLBACK5-NEXT: movq %rsi, 16(%rdx) -; FALLBACK5-NEXT: movq %rdi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: shl_32bytes: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %esi -; FALLBACK6-NEXT: leal (,%rsi,8), %eax -; FALLBACK6-NEXT: xorps %xmm2, %xmm2 -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movl %eax, %ecx -; FALLBACK6-NEXT: andb $24, %sil -; FALLBACK6-NEXT: negb %sil -; FALLBACK6-NEXT: movsbq %sil, %rsi -; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK6-NEXT: shrq %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK6-NEXT: orq %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi -; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 -; FALLBACK6-NEXT: shrq %rsi -; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK6-NEXT: orq %r9, %rsi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx -; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %rcx, (%rdx) -; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rsi, 16(%rdx) -; FALLBACK6-NEXT: movq %r8, 24(%rdx) -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: shl_32bytes: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movzbl (%rsi), %eax -; FALLBACK7-NEXT: leal (,%rax,8), %ecx -; FALLBACK7-NEXT: xorps %xmm2, %xmm2 -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: andb $24, %al -; FALLBACK7-NEXT: negb %al -; FALLBACK7-NEXT: movsbq %al, %rax -; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK7-NEXT: shldq %cl, %rax, %rsi -; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shldq %cl, %r8, %rax -; FALLBACK7-NEXT: movq %rax, 8(%rdx) -; FALLBACK7-NEXT: movq %rsi, 16(%rdx) -; FALLBACK7-NEXT: movq %rdi, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: shl_32bytes: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: movzbl (%rsi), %ecx -; FALLBACK8-NEXT: leal (,%rcx,8), %eax -; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: andb $24, %cl -; FALLBACK8-NEXT: negb %cl -; FALLBACK8-NEXT: movsbq %cl, %r8 -; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r9 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10 -; FALLBACK8-NEXT: movq %r10, %rdi -; FALLBACK8-NEXT: shrq %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %rdi -; FALLBACK8-NEXT: orq %r9, %rdi -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8 -; FALLBACK8-NEXT: movq %r8, %r11 -; FALLBACK8-NEXT: shrq %r11 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: orq %r10, %r11 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r8 -; FALLBACK8-NEXT: movq %r9, %r10 -; FALLBACK8-NEXT: shrq %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, (%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %r11, 16(%rdx) -; FALLBACK8-NEXT: movq %rdi, 24(%rdx) -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: shl_32bytes: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: movzbl (%rsi), %eax -; FALLBACK9-NEXT: leal (,%rax,8), %ecx -; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: andb $24, %al -; FALLBACK9-NEXT: negb %al -; FALLBACK9-NEXT: movsbq %al, %rax -; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK9-NEXT: shldq %cl, %rax, %rsi -; FALLBACK9-NEXT: movq %r8, %r9 -; FALLBACK9-NEXT: shlq %cl, %r9 -; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK9-NEXT: shldq %cl, %r8, %rax -; FALLBACK9-NEXT: movq %rax, 8(%rdx) -; FALLBACK9-NEXT: movq %rsi, 16(%rdx) -; FALLBACK9-NEXT: movq %rdi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: shl_32bytes: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %esi -; FALLBACK10-NEXT: leal (,%rsi,8), %eax -; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movl %eax, %ecx -; FALLBACK10-NEXT: andb $24, %sil -; FALLBACK10-NEXT: negb %sil -; FALLBACK10-NEXT: movsbq %sil, %rsi -; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK10-NEXT: shrq %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK10-NEXT: orq %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi -; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 -; FALLBACK10-NEXT: shrq %rsi -; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK10-NEXT: orq %r9, %rsi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx -; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %rcx, (%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rsi, 16(%rdx) -; FALLBACK10-NEXT: movq %r8, 24(%rdx) -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: shl_32bytes: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: movzbl (%rsi), %eax -; FALLBACK11-NEXT: leal (,%rax,8), %ecx -; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: andb $24, %al -; FALLBACK11-NEXT: negb %al -; FALLBACK11-NEXT: movsbq %al, %rax -; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK11-NEXT: shldq %cl, %rax, %rsi -; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shldq %cl, %r8, %rax -; FALLBACK11-NEXT: movq %rax, 8(%rdx) -; FALLBACK11-NEXT: movq %rsi, 16(%rdx) -; FALLBACK11-NEXT: movq %rdi, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: shl_32bytes: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK12-NEXT: movzbl (%rsi), %ecx -; FALLBACK12-NEXT: leal (,%rcx,8), %eax -; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: andb $24, %cl -; FALLBACK12-NEXT: negb %cl -; FALLBACK12-NEXT: movsbq %cl, %r8 -; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r9 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10 -; FALLBACK12-NEXT: movq %r10, %rdi -; FALLBACK12-NEXT: shrq %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %rdi -; FALLBACK12-NEXT: orq %r9, %rdi -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8 -; FALLBACK12-NEXT: movq %r8, %r11 -; FALLBACK12-NEXT: shrq %r11 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: orq %r10, %r11 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r8 -; FALLBACK12-NEXT: movq %r9, %r10 -; FALLBACK12-NEXT: shrq %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, (%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %r11, 16(%rdx) -; FALLBACK12-NEXT: movq %rdi, 24(%rdx) -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: shl_32bytes: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK13-NEXT: movzbl (%rsi), %eax -; FALLBACK13-NEXT: leal (,%rax,8), %ecx -; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: andb $24, %al -; FALLBACK13-NEXT: negb %al -; FALLBACK13-NEXT: movsbq %al, %rax -; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK13-NEXT: shldq %cl, %rax, %rsi -; FALLBACK13-NEXT: movq %r8, %r9 -; FALLBACK13-NEXT: shlq %cl, %r9 -; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: shldq %cl, %r8, %rax -; FALLBACK13-NEXT: movq %rax, 8(%rdx) -; FALLBACK13-NEXT: movq %rsi, 16(%rdx) -; FALLBACK13-NEXT: movq %rdi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) -; FALLBACK13-NEXT: vzeroupper -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: shl_32bytes: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %esi -; FALLBACK14-NEXT: leal (,%rsi,8), %eax -; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movl %eax, %ecx -; FALLBACK14-NEXT: andb $24, %sil -; FALLBACK14-NEXT: negb %sil -; FALLBACK14-NEXT: movsbq %sil, %rsi -; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK14-NEXT: shrq %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK14-NEXT: orq %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 -; FALLBACK14-NEXT: shrq %rsi -; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK14-NEXT: orq %r9, %rsi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx -; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %rcx, (%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rsi, 16(%rdx) -; FALLBACK14-NEXT: movq %r8, 24(%rdx) -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: shl_32bytes: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK15-NEXT: movzbl (%rsi), %eax -; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: andb $24, %al -; FALLBACK15-NEXT: negb %al -; FALLBACK15-NEXT: movsbq %al, %rax -; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK15-NEXT: shldq %cl, %rax, %rsi -; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shldq %cl, %r8, %rax -; FALLBACK15-NEXT: movq %rax, 8(%rdx) -; FALLBACK15-NEXT: movq %rsi, 16(%rdx) -; FALLBACK15-NEXT: movq %rdi, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq -; -; FALLBACK16-LABEL: shl_32bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $108, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl (%ecx), %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%ecx), %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%ecx), %esi -; FALLBACK16-NEXT: movl 12(%ecx), %edi -; FALLBACK16-NEXT: movl 16(%ecx), %ebx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movl 20(%ecx), %ebp -; FALLBACK16-NEXT: movl 24(%ecx), %edx -; FALLBACK16-NEXT: movl 28(%ecx), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movb %ah, %ch -; FALLBACK16-NEXT: shlb $3, %ch -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $28, %ah -; FALLBACK16-NEXT: negb %ah -; FALLBACK16-NEXT: movsbl %ah, %ebx -; FALLBACK16-NEXT: movl 64(%esp,%ebx), %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 68(%esp,%ebx), %eax -; FALLBACK16-NEXT: movl %eax, %esi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movb %ch, %dl -; FALLBACK16-NEXT: notb %dl -; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %esi, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 76(%esp,%ebx), %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: movl 72(%esp,%ebx), %esi -; FALLBACK16-NEXT: movl %esi, %ebp -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %edi, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: shrl %eax -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: orl %esi, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 84(%esp,%ebx), %esi -; FALLBACK16-NEXT: movl %esi, %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: movl 80(%esp,%ebx), %edi -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %eax -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: orl %edi, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 92(%esp,%ebx), %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: movl 88(%esp,%ebx), %edi -; FALLBACK16-NEXT: movl %edi, %ebx -; FALLBACK16-NEXT: shrl %ebx -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: orl %eax, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: shrl %esi -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: orl %edi, %esi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl %edx, (%eax) -; FALLBACK16-NEXT: movl %esi, 24(%eax) -; FALLBACK16-NEXT: movl %ebx, 28(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 16(%eax) -; FALLBACK16-NEXT: movl %ebp, 20(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 8(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 12(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $108, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: shl_32bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $92, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl (%eax), %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%eax), %edx -; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%eax), %esi -; FALLBACK17-NEXT: movl 12(%eax), %edi -; FALLBACK17-NEXT: movl 16(%eax), %ebx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movl 20(%eax), %ebp -; FALLBACK17-NEXT: movl 24(%eax), %edx -; FALLBACK17-NEXT: movl 28(%eax), %eax -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movb %ch, %cl -; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: andb $28, %ch -; FALLBACK17-NEXT: negb %ch -; FALLBACK17-NEXT: movsbl %ch, %eax -; FALLBACK17-NEXT: movl 56(%esp,%eax), %edx -; FALLBACK17-NEXT: movl 60(%esp,%eax), %ebx -; FALLBACK17-NEXT: movl %ebx, %esi -; FALLBACK17-NEXT: shldl %cl, %edx, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 52(%esp,%eax), %esi -; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %esi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 64(%esp,%eax), %edi -; FALLBACK17-NEXT: movl 68(%esp,%eax), %ebp -; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %edi, %ebp -; FALLBACK17-NEXT: shldl %cl, %ebx, %edi -; FALLBACK17-NEXT: movl 48(%esp,%eax), %ebx -; FALLBACK17-NEXT: movl 72(%esp,%eax), %edx -; FALLBACK17-NEXT: movl 76(%esp,%eax), %esi -; FALLBACK17-NEXT: shldl %cl, %edx, %esi -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: shldl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl %edx, 24(%eax) -; FALLBACK17-NEXT: movl %esi, 28(%eax) -; FALLBACK17-NEXT: movl %edi, 16(%eax) -; FALLBACK17-NEXT: movl %ebp, 20(%eax) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, 8(%eax) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, 12(%eax) -; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK17-NEXT: shldl %cl, %ebx, %edx -; FALLBACK17-NEXT: shll %cl, %ebx -; FALLBACK17-NEXT: movl %ebx, (%eax) -; FALLBACK17-NEXT: movl %edx, 4(%eax) -; FALLBACK17-NEXT: addl $92, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: popl %ebp -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: shl_32bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %esi -; FALLBACK18-NEXT: movl 12(%eax), %edi -; FALLBACK18-NEXT: movl 16(%eax), %ebp -; FALLBACK18-NEXT: movzbl (%ebx), %ebx -; FALLBACK18-NEXT: movl 20(%eax), %edx -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl 28(%eax), %eax -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, %edx -; FALLBACK18-NEXT: shlb $3, %dl -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, %eax -; FALLBACK18-NEXT: movl %eax, %ebp -; FALLBACK18-NEXT: andb $28, %bl -; FALLBACK18-NEXT: negb %bl -; FALLBACK18-NEXT: movsbl %bl, %esi -; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 68(%esp,%esi), %ecx -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %eax, %ecx, %edi -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %edx, %ebx, %ebx -; FALLBACK18-NEXT: orl %edi, %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl %ebx, %edi -; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %eax -; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi -; FALLBACK18-NEXT: movl %ebp, %esi -; FALLBACK18-NEXT: shlxl %ebp, %edi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: orl %ebx, %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK18-NEXT: movl 80(%esp,%ebp), %ecx -; FALLBACK18-NEXT: movl %ecx, %ebx -; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax -; FALLBACK18-NEXT: movl 84(%esp,%ebp), %ebx -; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %esi, %ecx, %ecx -; FALLBACK18-NEXT: movl %esi, %eax -; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %ecx, %edi -; FALLBACK18-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: shlxl %esi, 92(%esp,%ecx), %ebp -; FALLBACK18-NEXT: movl 88(%esp,%ecx), %esi -; FALLBACK18-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi -; FALLBACK18-NEXT: orl %ebp, %esi -; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax -; FALLBACK18-NEXT: orl %ecx, %eax -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%edx) -; FALLBACK18-NEXT: movl %eax, 24(%edx) -; FALLBACK18-NEXT: movl %esi, 28(%edx) -; FALLBACK18-NEXT: movl %edi, 16(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 20(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 8(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 12(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 4(%edx) -; FALLBACK18-NEXT: addl $108, %esp -; FALLBACK18-NEXT: popl %esi -; FALLBACK18-NEXT: popl %edi -; FALLBACK18-NEXT: popl %ebx -; FALLBACK18-NEXT: popl %ebp -; FALLBACK18-NEXT: retl -; -; FALLBACK19-LABEL: shl_32bytes: -; FALLBACK19: # %bb.0: -; FALLBACK19-NEXT: pushl %ebp -; FALLBACK19-NEXT: pushl %ebx -; FALLBACK19-NEXT: pushl %edi -; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $92, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ecx), %esi -; FALLBACK19-NEXT: movl 12(%ecx), %edi -; FALLBACK19-NEXT: movl 16(%ecx), %ebp -; FALLBACK19-NEXT: movzbl (%ebx), %ebx -; FALLBACK19-NEXT: movl 20(%ecx), %edx -; FALLBACK19-NEXT: movl 24(%ecx), %eax -; FALLBACK19-NEXT: movl 28(%ecx), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, %ecx -; FALLBACK19-NEXT: shlb $3, %cl -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: andb $28, %bl -; FALLBACK19-NEXT: negb %bl -; FALLBACK19-NEXT: movsbl %bl, %eax -; FALLBACK19-NEXT: movl 56(%esp,%eax), %edx -; FALLBACK19-NEXT: movl 60(%esp,%eax), %esi -; FALLBACK19-NEXT: movl %esi, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: shldl %cl, %edx, %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 52(%esp,%eax), %ebx -; FALLBACK19-NEXT: shldl %cl, %ebx, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 64(%esp,%eax), %edi -; FALLBACK19-NEXT: movl 68(%esp,%eax), %ebp -; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shldl %cl, %edi, %ebp -; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK19-NEXT: shldl %cl, %edx, %edi -; FALLBACK19-NEXT: movl 48(%esp,%eax), %edx -; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 72(%esp,%eax), %edx -; FALLBACK19-NEXT: movl 76(%esp,%eax), %esi -; FALLBACK19-NEXT: shldl %cl, %edx, %esi -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: shldl %cl, %eax, %edx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK19-NEXT: movl %edx, 24(%eax) -; FALLBACK19-NEXT: movl %esi, 28(%eax) -; FALLBACK19-NEXT: movl %edi, 16(%eax) -; FALLBACK19-NEXT: movl %ebp, 20(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, 8(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, 12(%eax) -; FALLBACK19-NEXT: movl (%esp), %esi # 4-byte Reload -; FALLBACK19-NEXT: shlxl %ecx, %esi, %edx -; FALLBACK19-NEXT: movl %edx, (%eax) -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: shldl %cl, %esi, %ebx -; FALLBACK19-NEXT: movl %ebx, 4(%eax) -; FALLBACK19-NEXT: addl $92, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: shl_32bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $108, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK20-NEXT: movzbl (%eax), %ecx -; FALLBACK20-NEXT: movb %cl, %dh -; FALLBACK20-NEXT: shlb $3, %dh -; FALLBACK20-NEXT: xorps %xmm2, %xmm2 -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: andb $28, %cl -; FALLBACK20-NEXT: negb %cl -; FALLBACK20-NEXT: movsbl %cl, %ebx -; FALLBACK20-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: movb %dh, %dl -; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK20-NEXT: movl %esi, %eax -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %edi, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl %ebx, %edi -; FALLBACK20-NEXT: movl 76(%esp,%ebx), %ebp -; FALLBACK20-NEXT: movl %ebp, %eax -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %esi, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: movl 72(%esp,%ebx), %ebx -; FALLBACK20-NEXT: movl %ebx, %eax -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 68(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %esi -; FALLBACK20-NEXT: shrl %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: movl 64(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: shrl %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl 88(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %edi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %edi, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl 92(%esp,%eax), %edi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %edi, %ebp -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl %edx, (%eax) -; FALLBACK20-NEXT: movl %ebp, 28(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 24(%eax) -; FALLBACK20-NEXT: movl %ebx, 4(%eax) -; FALLBACK20-NEXT: movl %esi, 8(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 12(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 16(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 20(%eax) -; FALLBACK20-NEXT: addl $108, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: shl_32bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $92, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movups (%ecx), %xmm0 -; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK21-NEXT: movzbl (%eax), %eax -; FALLBACK21-NEXT: movl %eax, %ecx -; FALLBACK21-NEXT: shlb $3, %cl -; FALLBACK21-NEXT: xorps %xmm2, %xmm2 -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: andb $28, %al -; FALLBACK21-NEXT: negb %al -; FALLBACK21-NEXT: movsbl %al, %ebp -; FALLBACK21-NEXT: movl 64(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl 68(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shldl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %edx -; FALLBACK21-NEXT: shldl %cl, %edx, %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edi -; FALLBACK21-NEXT: shldl %cl, %edi, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %ebx -; FALLBACK21-NEXT: shldl %cl, %ebx, %edi -; FALLBACK21-NEXT: movl 72(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl %edx, %eax -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK21-NEXT: shldl %cl, %esi, %eax -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 76(%esp,%ebp), %ebp -; FALLBACK21-NEXT: shldl %cl, %edx, %ebp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK21-NEXT: movl %ebp, 28(%edx) -; FALLBACK21-NEXT: movl %eax, 24(%edx) -; FALLBACK21-NEXT: movl %esi, %eax -; FALLBACK21-NEXT: shll %cl, %eax -; FALLBACK21-NEXT: shldl %cl, %esi, %ebx -; FALLBACK21-NEXT: movl %ebx, 4(%edx) -; FALLBACK21-NEXT: movl %edi, 8(%edx) -; FALLBACK21-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK21-NEXT: movl %ecx, 12(%edx) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK21-NEXT: movl %ecx, 16(%edx) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK21-NEXT: movl %ecx, 20(%edx) -; FALLBACK21-NEXT: movl %eax, (%edx) -; FALLBACK21-NEXT: addl $92, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: shl_32bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $108, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %edx -; FALLBACK22-NEXT: movl %edx, %ecx -; FALLBACK22-NEXT: shlb $3, %cl -; FALLBACK22-NEXT: xorps %xmm2, %xmm2 -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, %ebx -; FALLBACK22-NEXT: andb $28, %dl -; FALLBACK22-NEXT: negb %dl -; FALLBACK22-NEXT: movsbl %dl, %edx -; FALLBACK22-NEXT: movl 84(%esp,%edx), %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %esi -; FALLBACK22-NEXT: notb %cl -; FALLBACK22-NEXT: movl 80(%esp,%edx), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %esi, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 76(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %edi -; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebp, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: movl 72(%esp,%edx), %edi -; FALLBACK22-NEXT: movl %edi, %ebp -; FALLBACK22-NEXT: shrl %ebp -; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ebp -; FALLBACK22-NEXT: orl %esi, %ebp -; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %ebx, %edi, %esi -; FALLBACK22-NEXT: movl 68(%esp,%edx), %ebp -; FALLBACK22-NEXT: movl %ebp, %edi -; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %esi, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK22-NEXT: orl %ebp, %edi -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ecx, %eax, %esi -; FALLBACK22-NEXT: movl 88(%esp,%edx), %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %ebp -; FALLBACK22-NEXT: orl %ebp, %esi -; FALLBACK22-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: shlxl %ebx, 92(%esp,%edx), %edx -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ecx, %eax, %eax -; FALLBACK22-NEXT: orl %edx, %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movl %ebp, (%ecx) -; FALLBACK22-NEXT: movl %eax, 28(%ecx) -; FALLBACK22-NEXT: movl %esi, 24(%ecx) -; FALLBACK22-NEXT: movl %edi, 4(%ecx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 8(%ecx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%ecx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 16(%ecx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%ecx) -; FALLBACK22-NEXT: addl $108, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: shl_32bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $92, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movups (%ecx), %xmm0 -; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK23-NEXT: movzbl (%eax), %eax -; FALLBACK23-NEXT: movl %eax, %ecx -; FALLBACK23-NEXT: shlb $3, %cl -; FALLBACK23-NEXT: xorps %xmm2, %xmm2 -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: andb $28, %al -; FALLBACK23-NEXT: negb %al -; FALLBACK23-NEXT: movsbl %al, %ebx -; FALLBACK23-NEXT: movl 64(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl 68(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shldl %cl, %eax, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 60(%esp,%ebx), %edx -; FALLBACK23-NEXT: shldl %cl, %edx, %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 56(%esp,%ebx), %edi -; FALLBACK23-NEXT: shldl %cl, %edi, %edx -; FALLBACK23-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK23-NEXT: movl 52(%esp,%ebx), %ebp -; FALLBACK23-NEXT: shldl %cl, %ebp, %edi -; FALLBACK23-NEXT: movl 72(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, %eax -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: shldl %cl, %esi, %eax -; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 76(%esp,%ebx), %ebx -; FALLBACK23-NEXT: shldl %cl, %edx, %ebx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK23-NEXT: movl %ebx, 28(%edx) -; FALLBACK23-NEXT: movl %eax, 24(%edx) -; FALLBACK23-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: shldl %cl, %esi, %ebp -; FALLBACK23-NEXT: movl %ebp, 4(%edx) -; FALLBACK23-NEXT: movl %edi, 8(%edx) -; FALLBACK23-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 12(%edx) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 16(%edx) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 20(%edx) -; FALLBACK23-NEXT: movl %eax, (%edx) -; FALLBACK23-NEXT: addl $92, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: shl_32bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $108, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK24-NEXT: movzbl (%eax), %ecx -; FALLBACK24-NEXT: movb %cl, %dh -; FALLBACK24-NEXT: shlb $3, %dh -; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: andb $28, %cl -; FALLBACK24-NEXT: negb %cl -; FALLBACK24-NEXT: movsbl %cl, %ebx -; FALLBACK24-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: movb %dh, %dl -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK24-NEXT: movl %esi, %eax -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %edi, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl %ebx, %edi -; FALLBACK24-NEXT: movl 76(%esp,%ebx), %ebp -; FALLBACK24-NEXT: movl %ebp, %eax -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %esi, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: movl 72(%esp,%ebx), %ebx -; FALLBACK24-NEXT: movl %ebx, %eax -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 68(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %esi -; FALLBACK24-NEXT: shrl %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: movl 64(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: shrl %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl 88(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %edi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %edi, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl 92(%esp,%eax), %edi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %edi, %ebp -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl %edx, (%eax) -; FALLBACK24-NEXT: movl %ebp, 28(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 24(%eax) -; FALLBACK24-NEXT: movl %ebx, 4(%eax) -; FALLBACK24-NEXT: movl %esi, 8(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 12(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 16(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 20(%eax) -; FALLBACK24-NEXT: addl $108, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: vzeroupper -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: shl_32bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $92, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK25-NEXT: movzbl (%eax), %eax -; FALLBACK25-NEXT: movl %eax, %ecx -; FALLBACK25-NEXT: shlb $3, %cl -; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: andb $28, %al -; FALLBACK25-NEXT: negb %al -; FALLBACK25-NEXT: movsbl %al, %ebp -; FALLBACK25-NEXT: movl 64(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl 68(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %edx -; FALLBACK25-NEXT: shldl %cl, %edx, %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edi -; FALLBACK25-NEXT: shldl %cl, %edi, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %ebx -; FALLBACK25-NEXT: shldl %cl, %ebx, %edi -; FALLBACK25-NEXT: movl 72(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl %edx, %eax -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK25-NEXT: shldl %cl, %esi, %eax -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 76(%esp,%ebp), %ebp -; FALLBACK25-NEXT: shldl %cl, %edx, %ebp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK25-NEXT: movl %ebp, 28(%edx) -; FALLBACK25-NEXT: movl %eax, 24(%edx) -; FALLBACK25-NEXT: movl %esi, %eax -; FALLBACK25-NEXT: shll %cl, %eax -; FALLBACK25-NEXT: shldl %cl, %esi, %ebx -; FALLBACK25-NEXT: movl %ebx, 4(%edx) -; FALLBACK25-NEXT: movl %edi, 8(%edx) -; FALLBACK25-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK25-NEXT: movl %ecx, 12(%edx) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK25-NEXT: movl %ecx, 16(%edx) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK25-NEXT: movl %ecx, 20(%edx) -; FALLBACK25-NEXT: movl %eax, (%edx) -; FALLBACK25-NEXT: addl $92, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: vzeroupper -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: shl_32bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $108, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %edx -; FALLBACK26-NEXT: movl %edx, %eax -; FALLBACK26-NEXT: shlb $3, %al -; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: andb $28, %dl -; FALLBACK26-NEXT: negb %dl -; FALLBACK26-NEXT: movsbl %dl, %edx -; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %esi -; FALLBACK26-NEXT: notb %al -; FALLBACK26-NEXT: movl 80(%esp,%edx), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %eax, %edi, %edi -; FALLBACK26-NEXT: orl %esi, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 76(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %edi -; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %eax, %edi, %edi -; FALLBACK26-NEXT: orl %ebp, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: movl 72(%esp,%edx), %edi -; FALLBACK26-NEXT: movl %edi, %ebp -; FALLBACK26-NEXT: shrl %ebp -; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK26-NEXT: orl %esi, %ebp -; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %ebx, %edi, %esi -; FALLBACK26-NEXT: movl 68(%esp,%edx), %ebp -; FALLBACK26-NEXT: movl %ebp, %edi -; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %eax, %edi, %edi -; FALLBACK26-NEXT: orl %esi, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %eax, %esi, %edi -; FALLBACK26-NEXT: orl %ebp, %edi -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl 88(%esp,%edx), %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebp -; FALLBACK26-NEXT: orl %ebp, %esi -; FALLBACK26-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: shlxl %ebx, 92(%esp,%edx), %edx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %eax, %ecx, %eax -; FALLBACK26-NEXT: orl %edx, %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: movl %ebp, (%ecx) -; FALLBACK26-NEXT: movl %eax, 28(%ecx) -; FALLBACK26-NEXT: movl %esi, 24(%ecx) -; FALLBACK26-NEXT: movl %edi, 4(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 8(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 16(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%ecx) -; FALLBACK26-NEXT: addl $108, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: vzeroupper -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: shl_32bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $92, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK27-NEXT: movzbl (%eax), %eax -; FALLBACK27-NEXT: movl %eax, %ecx -; FALLBACK27-NEXT: shlb $3, %cl -; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: andb $28, %al -; FALLBACK27-NEXT: negb %al -; FALLBACK27-NEXT: movsbl %al, %ebx -; FALLBACK27-NEXT: movl 64(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl 68(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %eax, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 60(%esp,%ebx), %edx -; FALLBACK27-NEXT: shldl %cl, %edx, %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 56(%esp,%ebx), %edi -; FALLBACK27-NEXT: shldl %cl, %edi, %edx -; FALLBACK27-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK27-NEXT: movl 52(%esp,%ebx), %ebp -; FALLBACK27-NEXT: shldl %cl, %ebp, %edi -; FALLBACK27-NEXT: movl 72(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl %edx, %eax -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: shldl %cl, %esi, %eax -; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK27-NEXT: movl 76(%esp,%ebx), %ebx -; FALLBACK27-NEXT: shldl %cl, %edx, %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK27-NEXT: movl %ebx, 28(%edx) -; FALLBACK27-NEXT: movl %eax, 24(%edx) -; FALLBACK27-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: shldl %cl, %esi, %ebp -; FALLBACK27-NEXT: movl %ebp, 4(%edx) -; FALLBACK27-NEXT: movl %edi, 8(%edx) -; FALLBACK27-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 12(%edx) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 16(%edx) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 20(%edx) -; FALLBACK27-NEXT: movl %eax, (%edx) -; FALLBACK27-NEXT: addl $92, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: vzeroupper -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: shl_32bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $108, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK28-NEXT: movzbl (%eax), %ecx -; FALLBACK28-NEXT: movb %cl, %dh -; FALLBACK28-NEXT: shlb $3, %dh -; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: andb $28, %cl -; FALLBACK28-NEXT: negb %cl -; FALLBACK28-NEXT: movsbl %cl, %ebx -; FALLBACK28-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: movb %dh, %dl -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK28-NEXT: movl %esi, %eax -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %edi, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl %ebx, %edi -; FALLBACK28-NEXT: movl 76(%esp,%ebx), %ebp -; FALLBACK28-NEXT: movl %ebp, %eax -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %esi, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: movl 72(%esp,%ebx), %ebx -; FALLBACK28-NEXT: movl %ebx, %eax -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 68(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %esi -; FALLBACK28-NEXT: shrl %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: movl 64(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: shrl %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl 88(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %edi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %edi, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl 92(%esp,%eax), %edi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %edi, %ebp -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl %edx, (%eax) -; FALLBACK28-NEXT: movl %ebp, 28(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 24(%eax) -; FALLBACK28-NEXT: movl %ebx, 4(%eax) -; FALLBACK28-NEXT: movl %esi, 8(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 12(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 16(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 20(%eax) -; FALLBACK28-NEXT: addl $108, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: vzeroupper -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: shl_32bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $92, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK29-NEXT: movzbl (%eax), %eax -; FALLBACK29-NEXT: movl %eax, %ecx -; FALLBACK29-NEXT: shlb $3, %cl -; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: andb $28, %al -; FALLBACK29-NEXT: negb %al -; FALLBACK29-NEXT: movsbl %al, %ebp -; FALLBACK29-NEXT: movl 64(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl 68(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %edx -; FALLBACK29-NEXT: shldl %cl, %edx, %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edi -; FALLBACK29-NEXT: shldl %cl, %edi, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %ebx -; FALLBACK29-NEXT: shldl %cl, %ebx, %edi -; FALLBACK29-NEXT: movl 72(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl %edx, %eax -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK29-NEXT: shldl %cl, %esi, %eax -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 76(%esp,%ebp), %ebp -; FALLBACK29-NEXT: shldl %cl, %edx, %ebp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK29-NEXT: movl %ebp, 28(%edx) -; FALLBACK29-NEXT: movl %eax, 24(%edx) -; FALLBACK29-NEXT: movl %esi, %eax -; FALLBACK29-NEXT: shll %cl, %eax -; FALLBACK29-NEXT: shldl %cl, %esi, %ebx -; FALLBACK29-NEXT: movl %ebx, 4(%edx) -; FALLBACK29-NEXT: movl %edi, 8(%edx) -; FALLBACK29-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK29-NEXT: movl %ecx, 12(%edx) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK29-NEXT: movl %ecx, 16(%edx) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK29-NEXT: movl %ecx, 20(%edx) -; FALLBACK29-NEXT: movl %eax, (%edx) -; FALLBACK29-NEXT: addl $92, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: vzeroupper -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: shl_32bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $108, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %edx -; FALLBACK30-NEXT: movl %edx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: andb $28, %dl -; FALLBACK30-NEXT: negb %dl -; FALLBACK30-NEXT: movsbl %dl, %edx -; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %esi -; FALLBACK30-NEXT: notb %al -; FALLBACK30-NEXT: movl 80(%esp,%edx), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %eax, %edi, %edi -; FALLBACK30-NEXT: orl %esi, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 76(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %edi -; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %eax, %edi, %edi -; FALLBACK30-NEXT: orl %ebp, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: movl 72(%esp,%edx), %edi -; FALLBACK30-NEXT: movl %edi, %ebp -; FALLBACK30-NEXT: shrl %ebp -; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK30-NEXT: orl %esi, %ebp -; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %ebx, %edi, %esi -; FALLBACK30-NEXT: movl 68(%esp,%edx), %ebp -; FALLBACK30-NEXT: movl %ebp, %edi -; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %eax, %edi, %edi -; FALLBACK30-NEXT: orl %esi, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %eax, %esi, %edi -; FALLBACK30-NEXT: orl %ebp, %edi -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl 88(%esp,%edx), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %esi -; FALLBACK30-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: shlxl %ebx, 92(%esp,%edx), %edx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %eax, %ecx, %eax -; FALLBACK30-NEXT: orl %edx, %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: movl %ebp, (%ecx) -; FALLBACK30-NEXT: movl %eax, 28(%ecx) -; FALLBACK30-NEXT: movl %esi, 24(%ecx) -; FALLBACK30-NEXT: movl %edi, 4(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 8(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 16(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%ecx) -; FALLBACK30-NEXT: addl $108, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: vzeroupper -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: shl_32bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $92, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK31-NEXT: movzbl (%eax), %eax -; FALLBACK31-NEXT: movl %eax, %ecx -; FALLBACK31-NEXT: shlb $3, %cl -; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: andb $28, %al -; FALLBACK31-NEXT: negb %al -; FALLBACK31-NEXT: movsbl %al, %ebx -; FALLBACK31-NEXT: movl 64(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl 68(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %eax, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 60(%esp,%ebx), %edx -; FALLBACK31-NEXT: shldl %cl, %edx, %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 56(%esp,%ebx), %edi -; FALLBACK31-NEXT: shldl %cl, %edi, %edx -; FALLBACK31-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK31-NEXT: movl 52(%esp,%ebx), %ebp -; FALLBACK31-NEXT: shldl %cl, %ebp, %edi -; FALLBACK31-NEXT: movl 72(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, %eax -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: shldl %cl, %esi, %eax -; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 76(%esp,%ebx), %ebx -; FALLBACK31-NEXT: shldl %cl, %edx, %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK31-NEXT: movl %ebx, 28(%edx) -; FALLBACK31-NEXT: movl %eax, 24(%edx) -; FALLBACK31-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: shldl %cl, %esi, %ebp -; FALLBACK31-NEXT: movl %ebp, 4(%edx) -; FALLBACK31-NEXT: movl %edi, 8(%edx) -; FALLBACK31-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 12(%edx) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 16(%edx) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 20(%edx) -; FALLBACK31-NEXT: movl %eax, (%edx) -; FALLBACK31-NEXT: addl $92, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: vzeroupper -; FALLBACK31-NEXT: retl +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r10), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r10), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r10), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r10), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rdi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rsi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rcx, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %sil, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rsi, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rsi, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes: +; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rcx,8), %eax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movsbq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%r8), %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes: +; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes: +; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %sil, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rsi, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rsi, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r9, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ah, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%ebx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%ebx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %dl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%ebx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%ebx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%ebx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%ebx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%ebx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%ebx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ch, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%eax), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 28(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 16(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 20(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 8(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 12(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 4(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %bl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %bl, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%esi), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, 92(%esp,%ecx), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %bl +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %bl, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%eax), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %cl, %dh +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %dh +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%ebx), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %dl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%ebx), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebx), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%eax), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 28(%edx) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%edx) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%edx) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 8(%edx) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%edx) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%edx) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%edx) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %dl, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%edx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%edx), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%edx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 24(%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 4(%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%ecx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %al, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 28(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 4(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 8(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, (%edx) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes: +; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %cl, %dh +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %dh +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $28, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movsbl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%ebx), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %dl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%ebx), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebx), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%eax), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 28(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 8(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %ebx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %esi, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edx, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 28(%edx) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 24(%edx) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%edx) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 8(%edx) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%edx) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%edx) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%edx) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %dl, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%edx), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 28(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 24(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 4(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 16(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 20(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %al, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %esi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 28(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 24(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %esi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 4(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 8(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 16(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 20(%edx) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, (%edx) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 %bitOff = shl i256 %byteOff, 3 @@ -7869,617 +6434,472 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { } define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: shl_32bytes_dwordOff: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: movl %esi, %eax -; FALLBACK0-NEXT: shlb $5, %al -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: shlb $2, %sil -; FALLBACK0-NEXT: andb $24, %sil -; FALLBACK0-NEXT: negb %sil -; FALLBACK0-NEXT: movsbq %sil, %r10 -; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8 -; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq %r8, %r9 -; FALLBACK0-NEXT: shrq %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r9 -; FALLBACK0-NEXT: orq %r11, %r9 -; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10 -; FALLBACK0-NEXT: movq %r10, %rbx -; FALLBACK0-NEXT: shrq %rbx -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: orq %r11, %rbx -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: shrq %rdi -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %rdi, 16(%rdx) -; FALLBACK0-NEXT: movq %rbx, 24(%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: shl_32bytes_dwordOff: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %rdi -; FALLBACK1-NEXT: movzbl (%rsi), %esi -; FALLBACK1-NEXT: movl %esi, %ecx -; FALLBACK1-NEXT: shlb $5, %cl -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: shlb $2, %sil -; FALLBACK1-NEXT: andb $24, %sil -; FALLBACK1-NEXT: negb %sil -; FALLBACK1-NEXT: movsbq %sil, %rax -; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK1-NEXT: shldq %cl, %rax, %rsi -; FALLBACK1-NEXT: shldq %cl, %r8, %rax -; FALLBACK1-NEXT: shlq %cl, %r8 -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rdi, 24(%rdx) -; FALLBACK1-NEXT: movq %r8, (%rdx) -; FALLBACK1-NEXT: movq %rax, 8(%rdx) -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: shl_32bytes_dwordOff: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: movl %esi, %eax -; FALLBACK2-NEXT: shlb $5, %al -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movl %eax, %ecx -; FALLBACK2-NEXT: shlb $2, %sil -; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rdi -; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 -; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi -; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 -; FALLBACK2-NEXT: notb %al -; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 -; FALLBACK2-NEXT: shrq %r8 -; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK2-NEXT: orq %r9, %r8 -; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi -; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx -; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r9, %rdi -; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax -; FALLBACK2-NEXT: orq %rcx, %rax -; FALLBACK2-NEXT: movq %r10, (%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rdi, 24(%rdx) -; FALLBACK2-NEXT: movq %r8, 8(%rdx) -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: shl_32bytes_dwordOff: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: movl %esi, %ecx -; FALLBACK3-NEXT: shlb $5, %cl -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: shlb $2, %sil -; FALLBACK3-NEXT: andb $24, %sil -; FALLBACK3-NEXT: negb %sil -; FALLBACK3-NEXT: movsbq %sil, %rax -; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK3-NEXT: shldq %cl, %rax, %rsi -; FALLBACK3-NEXT: shldq %cl, %r8, %rax -; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rdi, 24(%rdx) -; FALLBACK3-NEXT: movq %rcx, (%rdx) -; FALLBACK3-NEXT: movq %rax, 8(%rdx) -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: shl_32bytes_dwordOff: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movzbl (%rsi), %ecx -; FALLBACK4-NEXT: movl %ecx, %eax -; FALLBACK4-NEXT: shlb $5, %al -; FALLBACK4-NEXT: xorps %xmm2, %xmm2 -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: shlb $2, %cl -; FALLBACK4-NEXT: andb $24, %cl -; FALLBACK4-NEXT: negb %cl -; FALLBACK4-NEXT: movsbq %cl, %r8 -; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r9 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10 -; FALLBACK4-NEXT: movq %r10, %rdi -; FALLBACK4-NEXT: shrq %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %rdi -; FALLBACK4-NEXT: orq %r9, %rdi -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8 -; FALLBACK4-NEXT: movq %r8, %r11 -; FALLBACK4-NEXT: shrq %r11 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: orq %r10, %r11 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r8 -; FALLBACK4-NEXT: movq %r9, %r10 -; FALLBACK4-NEXT: shrq %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r9 -; FALLBACK4-NEXT: movq %r9, (%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %r11, 16(%rdx) -; FALLBACK4-NEXT: movq %rdi, 24(%rdx) -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: shl_32bytes_dwordOff: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movzbl (%rsi), %eax -; FALLBACK5-NEXT: movl %eax, %ecx -; FALLBACK5-NEXT: shlb $5, %cl -; FALLBACK5-NEXT: xorps %xmm2, %xmm2 -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: shlb $2, %al -; FALLBACK5-NEXT: andb $24, %al -; FALLBACK5-NEXT: negb %al -; FALLBACK5-NEXT: movsbq %al, %rax -; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK5-NEXT: shldq %cl, %rax, %rsi -; FALLBACK5-NEXT: movq %r8, %r9 -; FALLBACK5-NEXT: shlq %cl, %r9 -; FALLBACK5-NEXT: shldq %cl, %r8, %rax -; FALLBACK5-NEXT: movq %rax, 8(%rdx) -; FALLBACK5-NEXT: movq %rsi, 16(%rdx) -; FALLBACK5-NEXT: movq %rdi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: shl_32bytes_dwordOff: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %esi -; FALLBACK6-NEXT: movl %esi, %eax -; FALLBACK6-NEXT: shlb $5, %al -; FALLBACK6-NEXT: xorps %xmm2, %xmm2 -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movl %eax, %ecx -; FALLBACK6-NEXT: shlb $2, %sil -; FALLBACK6-NEXT: andb $24, %sil -; FALLBACK6-NEXT: negb %sil -; FALLBACK6-NEXT: movsbq %sil, %rsi -; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK6-NEXT: shrq %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK6-NEXT: orq %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi -; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 -; FALLBACK6-NEXT: shrq %rsi -; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK6-NEXT: orq %r9, %rsi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx -; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %rcx, (%rdx) -; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rsi, 16(%rdx) -; FALLBACK6-NEXT: movq %r8, 24(%rdx) -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: shl_32bytes_dwordOff: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movzbl (%rsi), %eax -; FALLBACK7-NEXT: movl %eax, %ecx -; FALLBACK7-NEXT: shlb $5, %cl -; FALLBACK7-NEXT: xorps %xmm2, %xmm2 -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: shlb $2, %al -; FALLBACK7-NEXT: andb $24, %al -; FALLBACK7-NEXT: negb %al -; FALLBACK7-NEXT: movsbq %al, %rax -; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK7-NEXT: shldq %cl, %rax, %rsi -; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shldq %cl, %r8, %rax -; FALLBACK7-NEXT: movq %rax, 8(%rdx) -; FALLBACK7-NEXT: movq %rsi, 16(%rdx) -; FALLBACK7-NEXT: movq %rdi, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: shl_32bytes_dwordOff: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: movzbl (%rsi), %ecx -; FALLBACK8-NEXT: movl %ecx, %eax -; FALLBACK8-NEXT: shlb $5, %al -; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: shlb $2, %cl -; FALLBACK8-NEXT: andb $24, %cl -; FALLBACK8-NEXT: negb %cl -; FALLBACK8-NEXT: movsbq %cl, %r8 -; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r9 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10 -; FALLBACK8-NEXT: movq %r10, %rdi -; FALLBACK8-NEXT: shrq %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %rdi -; FALLBACK8-NEXT: orq %r9, %rdi -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8 -; FALLBACK8-NEXT: movq %r8, %r11 -; FALLBACK8-NEXT: shrq %r11 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: orq %r10, %r11 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r8 -; FALLBACK8-NEXT: movq %r9, %r10 -; FALLBACK8-NEXT: shrq %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, (%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %r11, 16(%rdx) -; FALLBACK8-NEXT: movq %rdi, 24(%rdx) -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: shl_32bytes_dwordOff: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: movzbl (%rsi), %eax -; FALLBACK9-NEXT: movl %eax, %ecx -; FALLBACK9-NEXT: shlb $5, %cl -; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: shlb $2, %al -; FALLBACK9-NEXT: andb $24, %al -; FALLBACK9-NEXT: negb %al -; FALLBACK9-NEXT: movsbq %al, %rax -; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK9-NEXT: shldq %cl, %rax, %rsi -; FALLBACK9-NEXT: movq %r8, %r9 -; FALLBACK9-NEXT: shlq %cl, %r9 -; FALLBACK9-NEXT: shldq %cl, %r8, %rax -; FALLBACK9-NEXT: movq %rax, 8(%rdx) -; FALLBACK9-NEXT: movq %rsi, 16(%rdx) -; FALLBACK9-NEXT: movq %rdi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: shl_32bytes_dwordOff: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %esi -; FALLBACK10-NEXT: movl %esi, %eax -; FALLBACK10-NEXT: shlb $5, %al -; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movl %eax, %ecx -; FALLBACK10-NEXT: shlb $2, %sil -; FALLBACK10-NEXT: andb $24, %sil -; FALLBACK10-NEXT: negb %sil -; FALLBACK10-NEXT: movsbq %sil, %rsi -; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK10-NEXT: shrq %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK10-NEXT: orq %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi -; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 -; FALLBACK10-NEXT: shrq %rsi -; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK10-NEXT: orq %r9, %rsi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx -; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %rcx, (%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rsi, 16(%rdx) -; FALLBACK10-NEXT: movq %r8, 24(%rdx) -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: shl_32bytes_dwordOff: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: movzbl (%rsi), %eax -; FALLBACK11-NEXT: movl %eax, %ecx -; FALLBACK11-NEXT: shlb $5, %cl -; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: shlb $2, %al -; FALLBACK11-NEXT: andb $24, %al -; FALLBACK11-NEXT: negb %al -; FALLBACK11-NEXT: movsbq %al, %rax -; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK11-NEXT: shldq %cl, %rax, %rsi -; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shldq %cl, %r8, %rax -; FALLBACK11-NEXT: movq %rax, 8(%rdx) -; FALLBACK11-NEXT: movq %rsi, 16(%rdx) -; FALLBACK11-NEXT: movq %rdi, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: shl_32bytes_dwordOff: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK12-NEXT: movzbl (%rsi), %ecx -; FALLBACK12-NEXT: movl %ecx, %eax -; FALLBACK12-NEXT: shlb $5, %al -; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: shlb $2, %cl -; FALLBACK12-NEXT: andb $24, %cl -; FALLBACK12-NEXT: negb %cl -; FALLBACK12-NEXT: movsbq %cl, %r8 -; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r9 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10 -; FALLBACK12-NEXT: movq %r10, %rdi -; FALLBACK12-NEXT: shrq %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %rdi -; FALLBACK12-NEXT: orq %r9, %rdi -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8 -; FALLBACK12-NEXT: movq %r8, %r11 -; FALLBACK12-NEXT: shrq %r11 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: orq %r10, %r11 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r8 -; FALLBACK12-NEXT: movq %r9, %r10 -; FALLBACK12-NEXT: shrq %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, (%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %r11, 16(%rdx) -; FALLBACK12-NEXT: movq %rdi, 24(%rdx) -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: shl_32bytes_dwordOff: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK13-NEXT: movzbl (%rsi), %eax -; FALLBACK13-NEXT: movl %eax, %ecx -; FALLBACK13-NEXT: shlb $5, %cl -; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: shlb $2, %al -; FALLBACK13-NEXT: andb $24, %al -; FALLBACK13-NEXT: negb %al -; FALLBACK13-NEXT: movsbq %al, %rax -; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK13-NEXT: shldq %cl, %rax, %rsi -; FALLBACK13-NEXT: movq %r8, %r9 -; FALLBACK13-NEXT: shlq %cl, %r9 -; FALLBACK13-NEXT: shldq %cl, %r8, %rax -; FALLBACK13-NEXT: movq %rax, 8(%rdx) -; FALLBACK13-NEXT: movq %rsi, 16(%rdx) -; FALLBACK13-NEXT: movq %rdi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) -; FALLBACK13-NEXT: vzeroupper -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: shl_32bytes_dwordOff: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %esi -; FALLBACK14-NEXT: movl %esi, %eax -; FALLBACK14-NEXT: shlb $5, %al -; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movl %eax, %ecx -; FALLBACK14-NEXT: shlb $2, %sil -; FALLBACK14-NEXT: andb $24, %sil -; FALLBACK14-NEXT: negb %sil -; FALLBACK14-NEXT: movsbq %sil, %rsi -; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK14-NEXT: shrq %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK14-NEXT: orq %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 -; FALLBACK14-NEXT: shrq %rsi -; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK14-NEXT: orq %r9, %rsi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx -; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %rcx, (%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rsi, 16(%rdx) -; FALLBACK14-NEXT: movq %r8, 24(%rdx) -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: shl_32bytes_dwordOff: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK15-NEXT: movzbl (%rsi), %eax -; FALLBACK15-NEXT: movl %eax, %ecx -; FALLBACK15-NEXT: shlb $5, %cl -; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: shlb $2, %al -; FALLBACK15-NEXT: andb $24, %al -; FALLBACK15-NEXT: negb %al -; FALLBACK15-NEXT: movsbq %al, %rax -; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK15-NEXT: shldq %cl, %rax, %rsi -; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shldq %cl, %r8, %rax -; FALLBACK15-NEXT: movq %rax, 8(%rdx) -; FALLBACK15-NEXT: movq %rsi, 16(%rdx) -; FALLBACK15-NEXT: movq %rdi, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $2, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r10), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r10), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r10), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r10), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $2, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $2, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rdi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rsi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rcx, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $2, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $2, %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $2, %al +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $2, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %sil, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rsi, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rsi, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $2, %al +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $2, %cl +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movsbq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%r8), %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $2, %al +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $2, %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %sil, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rsi, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rsi, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r9, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $2, %al +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq ; ; X86-SSE2-LABEL: shl_32bytes_dwordOff: ; X86-SSE2: # %bb.0: @@ -8789,2192 +7209,1656 @@ define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou } define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: ashr_32bytes: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: leal (,%rsi,8), %eax -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: sarq $63, %rdi -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: andb $24, %sil -; FALLBACK0-NEXT: movzbl %sil, %r9d -; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r11, %r8 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %rdi, %rdi -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: sarq %cl, %r9 -; FALLBACK0-NEXT: movq %r9, 24(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) -; FALLBACK0-NEXT: movq %rdi, (%rdx) -; FALLBACK0-NEXT: movq %r8, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: ashr_32bytes: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %rdi -; FALLBACK1-NEXT: movzbl (%rsi), %esi -; FALLBACK1-NEXT: leal (,%rsi,8), %ecx -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: sarq $63, %rdi -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: andb $24, %sil -; FALLBACK1-NEXT: movzbl %sil, %eax -; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi -; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8 -; FALLBACK1-NEXT: movq %r8, %r9 -; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: sarq %cl, %rax -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rax, 24(%rdx) -; FALLBACK1-NEXT: movq %rdi, (%rdx) -; FALLBACK1-NEXT: movq %r9, 8(%rdx) -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: ashr_32bytes: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: leal (,%rsi,8), %eax -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: sarq $63, %rdi -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movl %eax, %ecx -; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %esi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 -; FALLBACK2-NEXT: notb %al -; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r9, %rdi -; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rax -; FALLBACK2-NEXT: orq %r8, %rax -; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx -; FALLBACK2-NEXT: movq %rcx, 24(%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: ashr_32bytes: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: leal (,%rsi,8), %ecx -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: sarq $63, %rdi -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: andb $24, %sil -; FALLBACK3-NEXT: movzbl %sil, %eax -; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi -; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8 -; FALLBACK3-NEXT: movq %r8, %r9 -; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rax, 24(%rdx) -; FALLBACK3-NEXT: movq %rdi, (%rdx) -; FALLBACK3-NEXT: movq %r9, 8(%rdx) -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: ashr_32bytes: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movq 16(%rdi), %rcx -; FALLBACK4-NEXT: movq 24(%rdi), %rdi -; FALLBACK4-NEXT: movzbl (%rsi), %esi -; FALLBACK4-NEXT: leal (,%rsi,8), %eax -; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: sarq $63, %rdi -; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: andb $24, %sil -; FALLBACK4-NEXT: movzbl %sil, %r9d -; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK4-NEXT: movq %r10, %r11 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r11, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: addq %r10, %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: sarq %cl, %r9 -; FALLBACK4-NEXT: movq %r9, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) -; FALLBACK4-NEXT: movq %rdi, (%rdx) -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: ashr_32bytes: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movq 16(%rdi), %rax -; FALLBACK5-NEXT: movq 24(%rdi), %rdi -; FALLBACK5-NEXT: movzbl (%rsi), %esi -; FALLBACK5-NEXT: leal (,%rsi,8), %ecx -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: sarq $63, %rdi -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: andb $24, %sil -; FALLBACK5-NEXT: movzbl %sil, %eax -; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK5-NEXT: movq %rdi, %r8 -; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK5-NEXT: movq %rax, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK5-NEXT: sarq %cl, %rsi -; FALLBACK5-NEXT: movq %r10, 8(%rdx) -; FALLBACK5-NEXT: movq %r8, 16(%rdx) -; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: ashr_32bytes: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movq 16(%rdi), %rcx -; FALLBACK6-NEXT: movq 24(%rdi), %rdi -; FALLBACK6-NEXT: movzbl (%rsi), %esi -; FALLBACK6-NEXT: leal (,%rsi,8), %eax -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: sarq $63, %rdi -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movl %eax, %ecx -; FALLBACK6-NEXT: andb $24, %sil -; FALLBACK6-NEXT: movzbl %sil, %esi -; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %r8 -; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r9 -; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK6-NEXT: orq %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi -; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 -; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 -; FALLBACK6-NEXT: orq %rdi, %r11 -; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi -; FALLBACK6-NEXT: addq %r9, %r9 -; FALLBACK6-NEXT: shlxq %rax, %r9, %rax -; FALLBACK6-NEXT: orq %rdi, %rax -; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx -; FALLBACK6-NEXT: movq %rcx, 24(%rdx) -; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %r11, 16(%rdx) -; FALLBACK6-NEXT: movq %r10, (%rdx) -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: ashr_32bytes: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movq 16(%rdi), %rax -; FALLBACK7-NEXT: movq 24(%rdi), %rdi -; FALLBACK7-NEXT: movzbl (%rsi), %esi -; FALLBACK7-NEXT: leal (,%rsi,8), %ecx -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: sarq $63, %rdi -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: andb $24, %sil -; FALLBACK7-NEXT: movzbl %sil, %eax -; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK7-NEXT: movq %rdi, %r8 -; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK7-NEXT: movq %rax, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax -; FALLBACK7-NEXT: movq %r10, 8(%rdx) -; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rax, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: ashr_32bytes: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK8-NEXT: movq 16(%rdi), %rcx -; FALLBACK8-NEXT: movq 24(%rdi), %rdi -; FALLBACK8-NEXT: movzbl (%rsi), %esi -; FALLBACK8-NEXT: leal (,%rsi,8), %eax -; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: sarq $63, %rdi -; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: andb $24, %sil -; FALLBACK8-NEXT: movzbl %sil, %r9d -; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq %r10, %r11 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r11, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %r10, %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: sarq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) -; FALLBACK8-NEXT: movq %rdi, (%rdx) -; FALLBACK8-NEXT: popq %rbx -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: ashr_32bytes: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK9-NEXT: movq 16(%rdi), %rax -; FALLBACK9-NEXT: movq 24(%rdi), %rdi -; FALLBACK9-NEXT: movzbl (%rsi), %esi -; FALLBACK9-NEXT: leal (,%rsi,8), %ecx -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: sarq $63, %rdi -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: andb $24, %sil -; FALLBACK9-NEXT: movzbl %sil, %eax -; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq %rdi, %r8 -; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK9-NEXT: sarq %cl, %rsi -; FALLBACK9-NEXT: movq %r10, 8(%rdx) -; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: ashr_32bytes: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK10-NEXT: movq 16(%rdi), %rcx -; FALLBACK10-NEXT: movq 24(%rdi), %rdi -; FALLBACK10-NEXT: movzbl (%rsi), %esi -; FALLBACK10-NEXT: leal (,%rsi,8), %eax -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: sarq $63, %rdi -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movl %eax, %ecx -; FALLBACK10-NEXT: andb $24, %sil -; FALLBACK10-NEXT: movzbl %sil, %esi -; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %r8 -; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r9 -; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK10-NEXT: orq %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi -; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 -; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 -; FALLBACK10-NEXT: orq %rdi, %r11 -; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi -; FALLBACK10-NEXT: addq %r9, %r9 -; FALLBACK10-NEXT: shlxq %rax, %r9, %rax -; FALLBACK10-NEXT: orq %rdi, %rax -; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx -; FALLBACK10-NEXT: movq %rcx, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %r11, 16(%rdx) -; FALLBACK10-NEXT: movq %r10, (%rdx) -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: ashr_32bytes: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK11-NEXT: movq 16(%rdi), %rax -; FALLBACK11-NEXT: movq 24(%rdi), %rdi -; FALLBACK11-NEXT: movzbl (%rsi), %esi -; FALLBACK11-NEXT: leal (,%rsi,8), %ecx -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: sarq $63, %rdi -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: andb $24, %sil -; FALLBACK11-NEXT: movzbl %sil, %eax -; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK11-NEXT: movq %rdi, %r8 -; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax -; FALLBACK11-NEXT: movq %r10, 8(%rdx) -; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rax, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: ashr_32bytes: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK12-NEXT: movq 16(%rdi), %rcx -; FALLBACK12-NEXT: movq 24(%rdi), %rdi -; FALLBACK12-NEXT: movzbl (%rsi), %esi -; FALLBACK12-NEXT: leal (,%rsi,8), %eax -; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: sarq $63, %rdi -; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: andb $24, %sil -; FALLBACK12-NEXT: movzbl %sil, %r9d -; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %r11 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r11, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %r10, %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: sarq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) -; FALLBACK12-NEXT: movq %rdi, (%rdx) -; FALLBACK12-NEXT: popq %rbx -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: ashr_32bytes: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK13-NEXT: movq 16(%rdi), %rax -; FALLBACK13-NEXT: movq 24(%rdi), %rdi -; FALLBACK13-NEXT: movzbl (%rsi), %esi -; FALLBACK13-NEXT: leal (,%rsi,8), %ecx -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: sarq $63, %rdi -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: andb $24, %sil -; FALLBACK13-NEXT: movzbl %sil, %eax -; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK13-NEXT: movq %rdi, %r8 -; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK13-NEXT: movq %rax, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: sarq %cl, %rsi -; FALLBACK13-NEXT: movq %r10, 8(%rdx) -; FALLBACK13-NEXT: movq %r8, 16(%rdx) -; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: ashr_32bytes: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK14-NEXT: movq 16(%rdi), %rcx -; FALLBACK14-NEXT: movq 24(%rdi), %rdi -; FALLBACK14-NEXT: movzbl (%rsi), %esi -; FALLBACK14-NEXT: leal (,%rsi,8), %eax -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: sarq $63, %rdi -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movl %eax, %ecx -; FALLBACK14-NEXT: andb $24, %sil -; FALLBACK14-NEXT: movzbl %sil, %esi -; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %r8 -; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r9 -; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK14-NEXT: orq %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi -; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 -; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 -; FALLBACK14-NEXT: orq %rdi, %r11 -; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi -; FALLBACK14-NEXT: addq %r9, %r9 -; FALLBACK14-NEXT: shlxq %rax, %r9, %rax -; FALLBACK14-NEXT: orq %rdi, %rax -; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: movq %rcx, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %r11, 16(%rdx) -; FALLBACK14-NEXT: movq %r10, (%rdx) -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: ashr_32bytes: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK15-NEXT: movq 16(%rdi), %rax -; FALLBACK15-NEXT: movq 24(%rdi), %rdi -; FALLBACK15-NEXT: movzbl (%rsi), %esi -; FALLBACK15-NEXT: leal (,%rsi,8), %ecx -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: sarq $63, %rdi -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: andb $24, %sil -; FALLBACK15-NEXT: movzbl %sil, %eax -; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq %rdi, %r8 -; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax -; FALLBACK15-NEXT: movq %r10, 8(%rdx) -; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rax, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) -; FALLBACK15-NEXT: retq -; -; FALLBACK16-LABEL: ashr_32bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $108, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK16-NEXT: movl (%esi), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%esi), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%esi), %ebx -; FALLBACK16-NEXT: movl 12(%esi), %ebp -; FALLBACK16-NEXT: movl 16(%esi), %edi -; FALLBACK16-NEXT: movzbl (%eax), %ecx -; FALLBACK16-NEXT: movl 20(%esi), %edx -; FALLBACK16-NEXT: movl 24(%esi), %eax -; FALLBACK16-NEXT: movl 28(%esi), %esi -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, %edx -; FALLBACK16-NEXT: shlb $3, %dl -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: sarl $31, %esi -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $28, %cl -; FALLBACK16-NEXT: movzbl %cl, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax -; FALLBACK16-NEXT: movl %eax, %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movb %dl, %ch -; FALLBACK16-NEXT: notb %ch -; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: addl %eax, %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %esi, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp -; FALLBACK16-NEXT: movl %ebp, %esi -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl %edx, %ebx -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: movl 48(%esp,%eax), %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%edx,%edx), %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %esi, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebx, %edx -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebp, %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %edi, %ebp -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK16-NEXT: movl 52(%esp,%esi), %edi -; FALLBACK16-NEXT: movl %edi, %eax -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 56(%esp,%esi), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %eax, %esi -; FALLBACK16-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %edi, %edi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %eax, %edi -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl 60(%esp,%eax), %eax -; FALLBACK16-NEXT: leal (%eax,%eax), %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; FALLBACK16-NEXT: sarl %cl, %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl %eax, 28(%ecx) -; FALLBACK16-NEXT: movl %edx, 24(%ecx) -; FALLBACK16-NEXT: movl %edi, 16(%ecx) -; FALLBACK16-NEXT: movl %esi, 20(%ecx) -; FALLBACK16-NEXT: movl %ebp, 8(%ecx) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, 12(%ecx) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, (%ecx) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, 4(%ecx) -; FALLBACK16-NEXT: addl $108, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: ashr_32bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $92, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%ecx), %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%ecx), %edx -; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 12(%ecx), %ebp -; FALLBACK17-NEXT: movl 16(%ecx), %ebx -; FALLBACK17-NEXT: movzbl (%eax), %eax -; FALLBACK17-NEXT: movl 20(%ecx), %edi -; FALLBACK17-NEXT: movl 24(%ecx), %edx -; FALLBACK17-NEXT: movl 28(%ecx), %esi -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, %ecx -; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: sarl $31, %esi -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: andb $28, %al -; FALLBACK17-NEXT: movzbl %al, %ebp -; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %edx, %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %esi -; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi -; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %edx, %edi -; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 24(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: shrdl %cl, %edx, %esi -; FALLBACK17-NEXT: sarl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl %ebx, 16(%ebp) -; FALLBACK17-NEXT: movl %edi, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %esi, (%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) -; FALLBACK17-NEXT: addl $92, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: popl %ebp -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: ashr_32bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK18-NEXT: movl (%esi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%esi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%esi), %ebx -; FALLBACK18-NEXT: movl 12(%esi), %ebp -; FALLBACK18-NEXT: movl 16(%esi), %edi -; FALLBACK18-NEXT: movzbl (%edx), %edx -; FALLBACK18-NEXT: movl 20(%esi), %ecx -; FALLBACK18-NEXT: movl 24(%esi), %eax -; FALLBACK18-NEXT: movl 28(%esi), %esi -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, %ecx -; FALLBACK18-NEXT: shlb $3, %cl -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: sarl $31, %esi -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: andb $28, %dl -; FALLBACK18-NEXT: movzbl %dl, %esi -; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx -; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp -; FALLBACK18-NEXT: shrxl %eax, %edx, %edi -; FALLBACK18-NEXT: notb %cl -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx -; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK18-NEXT: orl %edi, %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %edi -; FALLBACK18-NEXT: addl %edx, %edx -; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK18-NEXT: orl %edi, %edx -; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx -; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: addl %edx, %edx -; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx -; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx -; FALLBACK18-NEXT: shrxl %eax, %edx, %edi -; FALLBACK18-NEXT: orl %edi, %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi -; FALLBACK18-NEXT: movl %eax, %ebp -; FALLBACK18-NEXT: addl %edx, %edx -; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi -; FALLBACK18-NEXT: leal (%edi,%edi), %edx -; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax -; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx -; FALLBACK18-NEXT: orl %ebx, %edx -; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi -; FALLBACK18-NEXT: leal (%esi,%esi), %ebx -; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx -; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: sarxl %ebp, %esi, %esi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edi -; FALLBACK18-NEXT: movl %esi, 28(%edi) -; FALLBACK18-NEXT: movl %ecx, 24(%edi) -; FALLBACK18-NEXT: movl %eax, 16(%edi) -; FALLBACK18-NEXT: movl %edx, 20(%edi) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 8(%edi) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 12(%edi) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, (%edi) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 4(%edi) -; FALLBACK18-NEXT: addl $108, %esp -; FALLBACK18-NEXT: popl %esi -; FALLBACK18-NEXT: popl %edi -; FALLBACK18-NEXT: popl %ebx -; FALLBACK18-NEXT: popl %ebp -; FALLBACK18-NEXT: retl -; -; FALLBACK19-LABEL: ashr_32bytes: -; FALLBACK19: # %bb.0: -; FALLBACK19-NEXT: pushl %ebp -; FALLBACK19-NEXT: pushl %ebx -; FALLBACK19-NEXT: pushl %edi -; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $92, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ecx), %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ecx), %edx -; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 12(%ecx), %ebp -; FALLBACK19-NEXT: movl 16(%ecx), %ebx -; FALLBACK19-NEXT: movzbl (%eax), %eax -; FALLBACK19-NEXT: movl 20(%ecx), %edi -; FALLBACK19-NEXT: movl 24(%ecx), %edx -; FALLBACK19-NEXT: movl 28(%ecx), %esi -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, %ecx -; FALLBACK19-NEXT: shlb $3, %cl -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: sarl $31, %esi -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: andb $28, %al -; FALLBACK19-NEXT: movzbl %al, %ebp -; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi -; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %esi, %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl %edx, %esi -; FALLBACK19-NEXT: shrdl %cl, %eax, %esi -; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi -; FALLBACK19-NEXT: shrdl %cl, %edi, %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl %eax, 24(%ebp) -; FALLBACK19-NEXT: sarxl %ecx, %edi, %eax -; FALLBACK19-NEXT: movl %eax, 28(%ebp) -; FALLBACK19-NEXT: movl %ebx, 16(%ebp) -; FALLBACK19-NEXT: movl %esi, 20(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 8(%ebp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 12(%ebp) -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: shrdl %cl, %eax, %edx -; FALLBACK19-NEXT: movl %edx, (%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 4(%ebp) -; FALLBACK19-NEXT: addl $92, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: ashr_32bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $108, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movl 16(%ecx), %esi -; FALLBACK20-NEXT: movl 20(%ecx), %edi -; FALLBACK20-NEXT: movl 24(%ecx), %ebx -; FALLBACK20-NEXT: movl 28(%ecx), %edx -; FALLBACK20-NEXT: movzbl (%eax), %eax -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shlb $3, %cl -; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: sarl $31, %edx -; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: andb $28, %al -; FALLBACK20-NEXT: movzbl %al, %edi -; FALLBACK20-NEXT: movl 32(%esp,%edi), %eax -; FALLBACK20-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: movl %ecx, %edx -; FALLBACK20-NEXT: movb %cl, %dh -; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %eax, %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, %eax -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: movl 48(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %eax, %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, %eax -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %eax -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebp, %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %eax, %ebp -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 60(%esp,%edi), %eax -; FALLBACK20-NEXT: leal (%eax,%eax), %edi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: sarl %cl, %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movl %eax, 28(%ecx) -; FALLBACK20-NEXT: movl %esi, 4(%ecx) -; FALLBACK20-NEXT: movl %edi, 24(%ecx) -; FALLBACK20-NEXT: movl %ebp, 16(%ecx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, 20(%ecx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, 8(%ecx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, 12(%ecx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, (%ecx) -; FALLBACK20-NEXT: addl $108, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: ashr_32bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $108, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movups (%ecx), %xmm0 -; FALLBACK21-NEXT: movl 16(%ecx), %esi -; FALLBACK21-NEXT: movl 20(%ecx), %edi -; FALLBACK21-NEXT: movl 24(%ecx), %ebx -; FALLBACK21-NEXT: movl 28(%ecx), %edx -; FALLBACK21-NEXT: movzbl (%eax), %eax -; FALLBACK21-NEXT: movl %eax, %ecx -; FALLBACK21-NEXT: shlb $3, %cl -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: sarl $31, %edx -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: andb $28, %al -; FALLBACK21-NEXT: movzbl %al, %ebp -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl %edi, %esi -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %esi, 4(%ebp) -; FALLBACK21-NEXT: movl %ebx, 24(%ebp) -; FALLBACK21-NEXT: shrdl %cl, %edi, %edx -; FALLBACK21-NEXT: sarl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 28(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %edx, (%ebp) -; FALLBACK21-NEXT: addl $108, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: ashr_32bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $108, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movl 16(%ecx), %esi -; FALLBACK22-NEXT: movl 20(%ecx), %edi -; FALLBACK22-NEXT: movl 24(%ecx), %ebp -; FALLBACK22-NEXT: movl 28(%ecx), %ecx -; FALLBACK22-NEXT: movzbl (%eax), %edx -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: shlb $3, %bl -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: sarl $31, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ebx, %eax -; FALLBACK22-NEXT: andb $28, %dl -; FALLBACK22-NEXT: movzbl %dl, %ecx -; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ecx), %edx -; FALLBACK22-NEXT: movl %eax, %ebp -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: movl 36(%esp,%ecx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK22-NEXT: orl %edx, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%ecx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %edx -; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi -; FALLBACK22-NEXT: movl 44(%esp,%ecx), %edx -; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi -; FALLBACK22-NEXT: orl %esi, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %edx, %edx -; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi -; FALLBACK22-NEXT: movl 40(%esp,%ecx), %edx -; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi -; FALLBACK22-NEXT: movl %ebp, %edx -; FALLBACK22-NEXT: orl %esi, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%ecx), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: movl 52(%esp,%ecx), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %esi, %eax -; FALLBACK22-NEXT: movl 60(%esp,%ecx), %ecx -; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %eax, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %eax -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; FALLBACK22-NEXT: orl %edx, %eax -; FALLBACK22-NEXT: sarxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %ecx, 28(%edx) -; FALLBACK22-NEXT: movl %eax, 4(%edx) -; FALLBACK22-NEXT: movl %esi, 24(%edx) -; FALLBACK22-NEXT: movl %edi, 16(%edx) -; FALLBACK22-NEXT: movl %ebp, 20(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 8(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, (%edx) -; FALLBACK22-NEXT: addl $108, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: ashr_32bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $108, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movups (%ecx), %xmm0 -; FALLBACK23-NEXT: movl 16(%ecx), %esi -; FALLBACK23-NEXT: movl 20(%ecx), %edi -; FALLBACK23-NEXT: movl 24(%ecx), %ebx -; FALLBACK23-NEXT: movl 28(%ecx), %edx -; FALLBACK23-NEXT: movzbl (%eax), %eax -; FALLBACK23-NEXT: movl %eax, %ecx -; FALLBACK23-NEXT: shlb $3, %cl -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: sarl $31, %edx -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: andb $28, %al -; FALLBACK23-NEXT: movzbl %al, %ebx -; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %edi -; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi -; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl %ebx, 4(%eax) -; FALLBACK23-NEXT: movl %ebp, 24(%eax) -; FALLBACK23-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK23-NEXT: movl %ebx, 28(%eax) -; FALLBACK23-NEXT: movl %esi, 16(%eax) -; FALLBACK23-NEXT: movl %edi, 20(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: movl %esi, 8(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: movl %esi, 12(%eax) -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, (%eax) -; FALLBACK23-NEXT: addl $108, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: ashr_32bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $108, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK24-NEXT: movl 16(%ecx), %esi -; FALLBACK24-NEXT: movl 20(%ecx), %edi -; FALLBACK24-NEXT: movl 24(%ecx), %ebx -; FALLBACK24-NEXT: movl 28(%ecx), %edx -; FALLBACK24-NEXT: movzbl (%eax), %eax -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shlb $3, %cl -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: sarl $31, %edx -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: andb $28, %al -; FALLBACK24-NEXT: movzbl %al, %edi -; FALLBACK24-NEXT: movl 32(%esp,%edi), %eax -; FALLBACK24-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: movl %ecx, %edx -; FALLBACK24-NEXT: movb %cl, %dh -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %eax, %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, %eax -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: movl 48(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %eax, %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, %eax -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %eax -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebp, %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %eax, %ebp -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 60(%esp,%edi), %eax -; FALLBACK24-NEXT: leal (%eax,%eax), %edi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: sarl %cl, %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: movl %eax, 28(%ecx) -; FALLBACK24-NEXT: movl %esi, 4(%ecx) -; FALLBACK24-NEXT: movl %edi, 24(%ecx) -; FALLBACK24-NEXT: movl %ebp, 16(%ecx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, 20(%ecx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, 8(%ecx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, 12(%ecx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, (%ecx) -; FALLBACK24-NEXT: addl $108, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: ashr_32bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $108, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK25-NEXT: movl 16(%ecx), %esi -; FALLBACK25-NEXT: movl 20(%ecx), %edi -; FALLBACK25-NEXT: movl 24(%ecx), %ebx -; FALLBACK25-NEXT: movl 28(%ecx), %edx -; FALLBACK25-NEXT: movzbl (%eax), %eax -; FALLBACK25-NEXT: movl %eax, %ecx -; FALLBACK25-NEXT: shlb $3, %cl -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: sarl $31, %edx -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: andb $28, %al -; FALLBACK25-NEXT: movzbl %al, %ebp -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl %edi, %esi -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %esi, 4(%ebp) -; FALLBACK25-NEXT: movl %ebx, 24(%ebp) -; FALLBACK25-NEXT: shrdl %cl, %edi, %edx -; FALLBACK25-NEXT: sarl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %edx, (%ebp) -; FALLBACK25-NEXT: addl $108, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: ashr_32bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $108, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movl 16(%ecx), %esi -; FALLBACK26-NEXT: movl 20(%ecx), %edi -; FALLBACK26-NEXT: movl 24(%ecx), %ebp -; FALLBACK26-NEXT: movl 28(%ecx), %ecx -; FALLBACK26-NEXT: movzbl (%eax), %edx -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: shlb $3, %bl -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: sarl $31, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ebx, %eax -; FALLBACK26-NEXT: andb $28, %dl -; FALLBACK26-NEXT: movzbl %dl, %ecx -; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ecx), %edx -; FALLBACK26-NEXT: movl %eax, %ebp -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: movl 36(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK26-NEXT: orl %edx, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %edx -; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi -; FALLBACK26-NEXT: movl 44(%esp,%ecx), %edx -; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi -; FALLBACK26-NEXT: orl %esi, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %edx, %edx -; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi -; FALLBACK26-NEXT: movl 40(%esp,%ecx), %edx -; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi -; FALLBACK26-NEXT: movl %ebp, %edx -; FALLBACK26-NEXT: orl %esi, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%ecx), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK26-NEXT: movl 52(%esp,%ecx), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi -; FALLBACK26-NEXT: orl %edi, %ebp -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %esi, %eax -; FALLBACK26-NEXT: movl 60(%esp,%ecx), %ecx -; FALLBACK26-NEXT: leal (%ecx,%ecx), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; FALLBACK26-NEXT: orl %edx, %eax -; FALLBACK26-NEXT: sarxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %ecx, 28(%edx) -; FALLBACK26-NEXT: movl %eax, 4(%edx) -; FALLBACK26-NEXT: movl %esi, 24(%edx) -; FALLBACK26-NEXT: movl %edi, 16(%edx) -; FALLBACK26-NEXT: movl %ebp, 20(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 8(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, (%edx) -; FALLBACK26-NEXT: addl $108, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: ashr_32bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $108, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK27-NEXT: movl 16(%ecx), %esi -; FALLBACK27-NEXT: movl 20(%ecx), %edi -; FALLBACK27-NEXT: movl 24(%ecx), %ebx -; FALLBACK27-NEXT: movl 28(%ecx), %edx -; FALLBACK27-NEXT: movzbl (%eax), %eax -; FALLBACK27-NEXT: movl %eax, %ecx -; FALLBACK27-NEXT: shlb $3, %cl -; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: sarl $31, %edx -; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: andb $28, %al -; FALLBACK27-NEXT: movzbl %al, %ebx -; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %edi -; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi -; FALLBACK27-NEXT: shrdl %cl, %eax, %esi -; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl %ebx, 4(%eax) -; FALLBACK27-NEXT: movl %ebp, 24(%eax) -; FALLBACK27-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK27-NEXT: movl %ebx, 28(%eax) -; FALLBACK27-NEXT: movl %esi, 16(%eax) -; FALLBACK27-NEXT: movl %edi, 20(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: movl %esi, 8(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: movl %esi, 12(%eax) -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, (%eax) -; FALLBACK27-NEXT: addl $108, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: ashr_32bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $108, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK28-NEXT: movl 16(%ecx), %esi -; FALLBACK28-NEXT: movl 20(%ecx), %edi -; FALLBACK28-NEXT: movl 24(%ecx), %ebx -; FALLBACK28-NEXT: movl 28(%ecx), %edx -; FALLBACK28-NEXT: movzbl (%eax), %eax -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shlb $3, %cl -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: sarl $31, %edx -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: andb $28, %al -; FALLBACK28-NEXT: movzbl %al, %edi -; FALLBACK28-NEXT: movl 32(%esp,%edi), %eax -; FALLBACK28-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl %ecx, %edx -; FALLBACK28-NEXT: movb %cl, %dh -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %eax, %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, %eax -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl 48(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %eax, %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, %eax -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %eax -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebp, %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %eax, %ebp -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 60(%esp,%edi), %eax -; FALLBACK28-NEXT: leal (%eax,%eax), %edi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: sarl %cl, %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: movl %eax, 28(%ecx) -; FALLBACK28-NEXT: movl %esi, 4(%ecx) -; FALLBACK28-NEXT: movl %edi, 24(%ecx) -; FALLBACK28-NEXT: movl %ebp, 16(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, 20(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, 8(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, 12(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, (%ecx) -; FALLBACK28-NEXT: addl $108, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: ashr_32bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $108, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK29-NEXT: movl 16(%ecx), %esi -; FALLBACK29-NEXT: movl 20(%ecx), %edi -; FALLBACK29-NEXT: movl 24(%ecx), %ebx -; FALLBACK29-NEXT: movl 28(%ecx), %edx -; FALLBACK29-NEXT: movzbl (%eax), %eax -; FALLBACK29-NEXT: movl %eax, %ecx -; FALLBACK29-NEXT: shlb $3, %cl -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: sarl $31, %edx -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: andb $28, %al -; FALLBACK29-NEXT: movzbl %al, %ebp -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl %edi, %esi -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %esi, 4(%ebp) -; FALLBACK29-NEXT: movl %ebx, 24(%ebp) -; FALLBACK29-NEXT: shrdl %cl, %edi, %edx -; FALLBACK29-NEXT: sarl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %edx, (%ebp) -; FALLBACK29-NEXT: addl $108, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: ashr_32bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $108, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movl 16(%ecx), %esi -; FALLBACK30-NEXT: movl 20(%ecx), %edi -; FALLBACK30-NEXT: movl 24(%ecx), %ebp -; FALLBACK30-NEXT: movl 28(%ecx), %ecx -; FALLBACK30-NEXT: movzbl (%eax), %edx -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: shlb $3, %bl -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: sarl $31, %ecx -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ebx, %eax -; FALLBACK30-NEXT: andb $28, %dl -; FALLBACK30-NEXT: movzbl %dl, %ecx -; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ecx), %edx -; FALLBACK30-NEXT: movl %eax, %ebp -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: movl 36(%esp,%ecx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK30-NEXT: orl %edx, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%ecx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %edx -; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi -; FALLBACK30-NEXT: movl 44(%esp,%ecx), %edx -; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi -; FALLBACK30-NEXT: orl %esi, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %edx, %edx -; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi -; FALLBACK30-NEXT: movl 40(%esp,%ecx), %edx -; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi -; FALLBACK30-NEXT: movl %ebp, %edx -; FALLBACK30-NEXT: orl %esi, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%ecx), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK30-NEXT: movl 52(%esp,%ecx), %eax -; FALLBACK30-NEXT: shrxl %edx, %eax, %edi -; FALLBACK30-NEXT: orl %edi, %ebp -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %edx, %esi, %eax -; FALLBACK30-NEXT: movl 60(%esp,%ecx), %ecx -; FALLBACK30-NEXT: leal (%ecx,%ecx), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; FALLBACK30-NEXT: orl %edx, %eax -; FALLBACK30-NEXT: sarxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %ecx, 28(%edx) -; FALLBACK30-NEXT: movl %eax, 4(%edx) -; FALLBACK30-NEXT: movl %esi, 24(%edx) -; FALLBACK30-NEXT: movl %edi, 16(%edx) -; FALLBACK30-NEXT: movl %ebp, 20(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 8(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, (%edx) -; FALLBACK30-NEXT: addl $108, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: ashr_32bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $108, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK31-NEXT: movl 16(%ecx), %esi -; FALLBACK31-NEXT: movl 20(%ecx), %edi -; FALLBACK31-NEXT: movl 24(%ecx), %ebx -; FALLBACK31-NEXT: movl 28(%ecx), %edx -; FALLBACK31-NEXT: movzbl (%eax), %eax -; FALLBACK31-NEXT: movl %eax, %ecx -; FALLBACK31-NEXT: shlb $3, %cl -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: sarl $31, %edx -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: andb $28, %al -; FALLBACK31-NEXT: movzbl %al, %ebx -; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %edi -; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi -; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl %ebx, 4(%eax) -; FALLBACK31-NEXT: movl %ebp, 24(%eax) -; FALLBACK31-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK31-NEXT: movl %ebx, 28(%eax) -; FALLBACK31-NEXT: movl %esi, 16(%eax) -; FALLBACK31-NEXT: movl %edi, 20(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: movl %esi, 8(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: movl %esi, 12(%eax) -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, (%eax) -; FALLBACK31-NEXT: addl $108, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: retl +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rsi,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes: +; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes: +; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rsi,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%esi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esi), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %dl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%edi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %eax, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%eax), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edx,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%eax,%eax), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 16(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 20(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 8(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esi), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%esi), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%edx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esi), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esi), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esi), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %dl, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%ebp,%ebp), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%esi), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%esi), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ebp, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 20(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ecx, %edi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %cl, %dh +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 24(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 16(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%ecx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%ecx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%ecx), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %bl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %bl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ebx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%ecx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes: +; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 16(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 20(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 24(%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 28(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %cl, %dh +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 24(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 16(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 16(%ecx), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 24(%ecx), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 28(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebp, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%ecx), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %bl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %bl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebp, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebp, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%ecx,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ebx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%ecx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 32(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 %bitOff = shl i256 %byteOff, 3 @@ -10984,663 +8868,500 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { } define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: ashr_32bytes_dwordOff: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: movl %esi, %eax -; FALLBACK0-NEXT: shlb $5, %al -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: sarq $63, %rdi -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: andb $6, %sil -; FALLBACK0-NEXT: movzbl %sil, %r9d -; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r11, %r8 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %rdi, %rdi -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: sarq %cl, %r9 -; FALLBACK0-NEXT: movq %r9, 24(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) -; FALLBACK0-NEXT: movq %rdi, (%rdx) -; FALLBACK0-NEXT: movq %r8, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: ashr_32bytes_dwordOff: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %rdi -; FALLBACK1-NEXT: movzbl (%rsi), %esi -; FALLBACK1-NEXT: movl %esi, %ecx -; FALLBACK1-NEXT: shlb $5, %cl -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: sarq $63, %rdi -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: andb $6, %sil -; FALLBACK1-NEXT: movzbl %sil, %eax -; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi -; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi -; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK1-NEXT: movq %r8, %r9 -; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK1-NEXT: sarq %cl, %rax -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rax, 24(%rdx) -; FALLBACK1-NEXT: movq %rdi, (%rdx) -; FALLBACK1-NEXT: movq %r9, 8(%rdx) -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: ashr_32bytes_dwordOff: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: movl %esi, %eax -; FALLBACK2-NEXT: shlb $5, %al -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: sarq $63, %rdi -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movl %eax, %ecx -; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %esi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi -; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 -; FALLBACK2-NEXT: notb %al -; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r9, %rdi -; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi -; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rax -; FALLBACK2-NEXT: orq %r8, %rax -; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx -; FALLBACK2-NEXT: movq %rcx, 24(%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: ashr_32bytes_dwordOff: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: movl %esi, %ecx -; FALLBACK3-NEXT: shlb $5, %cl -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: sarq $63, %rdi -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: andb $6, %sil -; FALLBACK3-NEXT: movzbl %sil, %eax -; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi -; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi -; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK3-NEXT: movq %r8, %r9 -; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rax, 24(%rdx) -; FALLBACK3-NEXT: movq %rdi, (%rdx) -; FALLBACK3-NEXT: movq %r9, 8(%rdx) -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: ashr_32bytes_dwordOff: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movq 16(%rdi), %rcx -; FALLBACK4-NEXT: movq 24(%rdi), %rdi -; FALLBACK4-NEXT: movzbl (%rsi), %esi -; FALLBACK4-NEXT: movl %esi, %eax -; FALLBACK4-NEXT: shlb $5, %al -; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: sarq $63, %rdi -; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: andb $6, %sil -; FALLBACK4-NEXT: movzbl %sil, %r9d -; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK4-NEXT: movq %r10, %r11 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r11, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: addq %r10, %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: sarq %cl, %r9 -; FALLBACK4-NEXT: movq %r9, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) -; FALLBACK4-NEXT: movq %rdi, (%rdx) -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: ashr_32bytes_dwordOff: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movq 16(%rdi), %rax -; FALLBACK5-NEXT: movq 24(%rdi), %rdi -; FALLBACK5-NEXT: movzbl (%rsi), %esi -; FALLBACK5-NEXT: movl %esi, %ecx -; FALLBACK5-NEXT: shlb $5, %cl -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: sarq $63, %rdi -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: andb $6, %sil -; FALLBACK5-NEXT: movzbl %sil, %eax -; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK5-NEXT: movq %rdi, %r8 -; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK5-NEXT: movq %rax, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK5-NEXT: sarq %cl, %rsi -; FALLBACK5-NEXT: movq %r10, 8(%rdx) -; FALLBACK5-NEXT: movq %r8, 16(%rdx) -; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: ashr_32bytes_dwordOff: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movq 16(%rdi), %rcx -; FALLBACK6-NEXT: movq 24(%rdi), %rdi -; FALLBACK6-NEXT: movzbl (%rsi), %esi -; FALLBACK6-NEXT: movl %esi, %eax -; FALLBACK6-NEXT: shlb $5, %al -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: sarq $63, %rdi -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movl %eax, %ecx -; FALLBACK6-NEXT: andb $6, %sil -; FALLBACK6-NEXT: movzbl %sil, %esi -; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi -; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: movq -64(%rsp,%rsi,4), %r8 -; FALLBACK6-NEXT: movq -56(%rsp,%rsi,4), %r9 -; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK6-NEXT: orq %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi -; FALLBACK6-NEXT: movq -48(%rsp,%rsi,4), %rsi -; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 -; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 -; FALLBACK6-NEXT: orq %rdi, %r11 -; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi -; FALLBACK6-NEXT: addq %r9, %r9 -; FALLBACK6-NEXT: shlxq %rax, %r9, %rax -; FALLBACK6-NEXT: orq %rdi, %rax -; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx -; FALLBACK6-NEXT: movq %rcx, 24(%rdx) -; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %r11, 16(%rdx) -; FALLBACK6-NEXT: movq %r10, (%rdx) -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: ashr_32bytes_dwordOff: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movq 16(%rdi), %rax -; FALLBACK7-NEXT: movq 24(%rdi), %rdi -; FALLBACK7-NEXT: movzbl (%rsi), %esi -; FALLBACK7-NEXT: movl %esi, %ecx -; FALLBACK7-NEXT: shlb $5, %cl -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: sarq $63, %rdi -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: andb $6, %sil -; FALLBACK7-NEXT: movzbl %sil, %eax -; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK7-NEXT: movq %rdi, %r8 -; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK7-NEXT: movq %rax, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax -; FALLBACK7-NEXT: movq %r10, 8(%rdx) -; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rax, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: ashr_32bytes_dwordOff: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK8-NEXT: movq 16(%rdi), %rcx -; FALLBACK8-NEXT: movq 24(%rdi), %rdi -; FALLBACK8-NEXT: movzbl (%rsi), %esi -; FALLBACK8-NEXT: movl %esi, %eax -; FALLBACK8-NEXT: shlb $5, %al -; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: sarq $63, %rdi -; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: andb $6, %sil -; FALLBACK8-NEXT: movzbl %sil, %r9d -; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK8-NEXT: movq %r10, %r11 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r11, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %r10, %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: sarq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) -; FALLBACK8-NEXT: movq %rdi, (%rdx) -; FALLBACK8-NEXT: popq %rbx -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: ashr_32bytes_dwordOff: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK9-NEXT: movq 16(%rdi), %rax -; FALLBACK9-NEXT: movq 24(%rdi), %rdi -; FALLBACK9-NEXT: movzbl (%rsi), %esi -; FALLBACK9-NEXT: movl %esi, %ecx -; FALLBACK9-NEXT: shlb $5, %cl -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: sarq $63, %rdi -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: andb $6, %sil -; FALLBACK9-NEXT: movzbl %sil, %eax -; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK9-NEXT: movq %rdi, %r8 -; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK9-NEXT: movq %rax, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK9-NEXT: sarq %cl, %rsi -; FALLBACK9-NEXT: movq %r10, 8(%rdx) -; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: ashr_32bytes_dwordOff: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK10-NEXT: movq 16(%rdi), %rcx -; FALLBACK10-NEXT: movq 24(%rdi), %rdi -; FALLBACK10-NEXT: movzbl (%rsi), %esi -; FALLBACK10-NEXT: movl %esi, %eax -; FALLBACK10-NEXT: shlb $5, %al -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: sarq $63, %rdi -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movl %eax, %ecx -; FALLBACK10-NEXT: andb $6, %sil -; FALLBACK10-NEXT: movzbl %sil, %esi -; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: movq -64(%rsp,%rsi,4), %r8 -; FALLBACK10-NEXT: movq -56(%rsp,%rsi,4), %r9 -; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK10-NEXT: orq %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi -; FALLBACK10-NEXT: movq -48(%rsp,%rsi,4), %rsi -; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 -; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 -; FALLBACK10-NEXT: orq %rdi, %r11 -; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi -; FALLBACK10-NEXT: addq %r9, %r9 -; FALLBACK10-NEXT: shlxq %rax, %r9, %rax -; FALLBACK10-NEXT: orq %rdi, %rax -; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx -; FALLBACK10-NEXT: movq %rcx, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %r11, 16(%rdx) -; FALLBACK10-NEXT: movq %r10, (%rdx) -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: ashr_32bytes_dwordOff: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK11-NEXT: movq 16(%rdi), %rax -; FALLBACK11-NEXT: movq 24(%rdi), %rdi -; FALLBACK11-NEXT: movzbl (%rsi), %esi -; FALLBACK11-NEXT: movl %esi, %ecx -; FALLBACK11-NEXT: shlb $5, %cl -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: sarq $63, %rdi -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: andb $6, %sil -; FALLBACK11-NEXT: movzbl %sil, %eax -; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK11-NEXT: movq %rdi, %r8 -; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK11-NEXT: movq %rax, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax -; FALLBACK11-NEXT: movq %r10, 8(%rdx) -; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rax, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: ashr_32bytes_dwordOff: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK12-NEXT: movq 16(%rdi), %rcx -; FALLBACK12-NEXT: movq 24(%rdi), %rdi -; FALLBACK12-NEXT: movzbl (%rsi), %esi -; FALLBACK12-NEXT: movl %esi, %eax -; FALLBACK12-NEXT: shlb $5, %al -; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: sarq $63, %rdi -; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: andb $6, %sil -; FALLBACK12-NEXT: movzbl %sil, %r9d -; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK12-NEXT: movq %r10, %r11 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r11, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %r10, %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: sarq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) -; FALLBACK12-NEXT: movq %rdi, (%rdx) -; FALLBACK12-NEXT: popq %rbx -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: ashr_32bytes_dwordOff: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK13-NEXT: movq 16(%rdi), %rax -; FALLBACK13-NEXT: movq 24(%rdi), %rdi -; FALLBACK13-NEXT: movzbl (%rsi), %esi -; FALLBACK13-NEXT: movl %esi, %ecx -; FALLBACK13-NEXT: shlb $5, %cl -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: sarq $63, %rdi -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: andb $6, %sil -; FALLBACK13-NEXT: movzbl %sil, %eax -; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK13-NEXT: movq %rdi, %r8 -; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK13-NEXT: movq %rax, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK13-NEXT: sarq %cl, %rsi -; FALLBACK13-NEXT: movq %r10, 8(%rdx) -; FALLBACK13-NEXT: movq %r8, 16(%rdx) -; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: ashr_32bytes_dwordOff: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK14-NEXT: movq 16(%rdi), %rcx -; FALLBACK14-NEXT: movq 24(%rdi), %rdi -; FALLBACK14-NEXT: movzbl (%rsi), %esi -; FALLBACK14-NEXT: movl %esi, %eax -; FALLBACK14-NEXT: shlb $5, %al -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: sarq $63, %rdi -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movl %eax, %ecx -; FALLBACK14-NEXT: andb $6, %sil -; FALLBACK14-NEXT: movzbl %sil, %esi -; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: movq -64(%rsp,%rsi,4), %r8 -; FALLBACK14-NEXT: movq -56(%rsp,%rsi,4), %r9 -; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 -; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 -; FALLBACK14-NEXT: orq %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi -; FALLBACK14-NEXT: movq -48(%rsp,%rsi,4), %rsi -; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 -; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 -; FALLBACK14-NEXT: orq %rdi, %r11 -; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi -; FALLBACK14-NEXT: addq %r9, %r9 -; FALLBACK14-NEXT: shlxq %rax, %r9, %rax -; FALLBACK14-NEXT: orq %rdi, %rax -; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: movq %rcx, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %r11, 16(%rdx) -; FALLBACK14-NEXT: movq %r10, (%rdx) -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: ashr_32bytes_dwordOff: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: vmovups (%rdi), %xmm0 -; FALLBACK15-NEXT: movq 16(%rdi), %rax -; FALLBACK15-NEXT: movq 24(%rdi), %rdi -; FALLBACK15-NEXT: movzbl (%rsi), %esi -; FALLBACK15-NEXT: movl %esi, %ecx -; FALLBACK15-NEXT: shlb $5, %cl -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: sarq $63, %rdi -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: andb $6, %sil -; FALLBACK15-NEXT: movzbl %sil, %eax -; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK15-NEXT: movq %rdi, %r8 -; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK15-NEXT: movq %rax, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax -; FALLBACK15-NEXT: movq %r10, 8(%rdx) -; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rax, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) -; FALLBACK15-NEXT: retq +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9,4), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9,4), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9,4), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9,4), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9,4), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9,4), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9,4), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9,4), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi,4), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi,4), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi,4), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rsi,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $6, %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9,4), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9,4), %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9,4), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9,4), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $6, %sil +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rsi,4), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rsi,4), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rsi,4), %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rsi,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %sil +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq ; ; X86-SSE2-LABEL: ashr_32bytes_dwordOff: ; X86-SSE2: # %bb.0: @@ -12023,3629 +9744,3629 @@ define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no } define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: lshr_64bytes: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %r15 -; FALLBACK0-NEXT: pushq %r14 -; FALLBACK0-NEXT: pushq %r13 -; FALLBACK0-NEXT: pushq %r12 -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rax -; FALLBACK0-NEXT: movq 8(%rdi), %rcx -; FALLBACK0-NEXT: movq 16(%rdi), %r8 -; FALLBACK0-NEXT: movq 24(%rdi), %r9 -; FALLBACK0-NEXT: movq 32(%rdi), %r10 -; FALLBACK0-NEXT: movq 40(%rdi), %r11 -; FALLBACK0-NEXT: movq 48(%rdi), %rbx -; FALLBACK0-NEXT: movq 56(%rdi), %r14 -; FALLBACK0-NEXT: movl (%rsi), %edi -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: leal (,%rdi,8), %eax -; FALLBACK0-NEXT: andl $56, %eax -; FALLBACK0-NEXT: andl $56, %edi -; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 -; FALLBACK0-NEXT: movq %r8, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r9 -; FALLBACK0-NEXT: orq %r11, %r9 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %r8, %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r10, %r8 -; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK0-NEXT: movq %r10, %r15 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r15 -; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14 -; FALLBACK0-NEXT: leaq (%r14,%r14), %r11 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: orq %r15, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: addq %r10, %r10 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 -; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx -; FALLBACK0-NEXT: movq %rbx, %r12 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r12 -; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13 -; FALLBACK0-NEXT: leaq (%r13,%r13), %r15 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r15 -; FALLBACK0-NEXT: orq %r12, %r15 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r14 -; FALLBACK0-NEXT: addq %rbx, %rbx -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %rbx -; FALLBACK0-NEXT: orq %r14, %rbx -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r13 -; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r14 -; FALLBACK0-NEXT: orq %r13, %r14 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rdi -; FALLBACK0-NEXT: movq %rdi, 56(%rdx) -; FALLBACK0-NEXT: movq %r14, 48(%rdx) -; FALLBACK0-NEXT: movq %rbx, 32(%rdx) -; FALLBACK0-NEXT: movq %r15, 40(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) -; FALLBACK0-NEXT: movq %r11, 24(%rdx) -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: popq %r12 -; FALLBACK0-NEXT: popq %r13 -; FALLBACK0-NEXT: popq %r14 -; FALLBACK0-NEXT: popq %r15 -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: lshr_64bytes: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: pushq %r15 -; FALLBACK1-NEXT: pushq %r14 -; FALLBACK1-NEXT: pushq %rbx -; FALLBACK1-NEXT: movq (%rdi), %rcx -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %r10 -; FALLBACK1-NEXT: movq 32(%rdi), %r11 -; FALLBACK1-NEXT: movq 40(%rdi), %rbx -; FALLBACK1-NEXT: movq 48(%rdi), %r14 -; FALLBACK1-NEXT: movq 56(%rdi), %rdi -; FALLBACK1-NEXT: movl (%rsi), %eax -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: leal (,%rax,8), %ecx -; FALLBACK1-NEXT: andl $56, %ecx -; FALLBACK1-NEXT: andl $56, %eax -; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK1-NEXT: movq %r9, %r8 -; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK1-NEXT: movq %r11, %rbx -; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx -; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK1-NEXT: movq %r14, %r15 -; FALLBACK1-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK1-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi -; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: shrq %cl, %rax -; FALLBACK1-NEXT: movq %r11, 48(%rdx) -; FALLBACK1-NEXT: movq %rax, 56(%rdx) -; FALLBACK1-NEXT: movq %r10, 32(%rdx) -; FALLBACK1-NEXT: movq %r15, 40(%rdx) -; FALLBACK1-NEXT: movq %rdi, 16(%rdx) -; FALLBACK1-NEXT: movq %rbx, 24(%rdx) -; FALLBACK1-NEXT: movq %rsi, (%rdx) -; FALLBACK1-NEXT: movq %r8, 8(%rdx) -; FALLBACK1-NEXT: popq %rbx -; FALLBACK1-NEXT: popq %r14 -; FALLBACK1-NEXT: popq %r15 -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: lshr_64bytes: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %r15 -; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r12 -; FALLBACK2-NEXT: pushq %rbx -; FALLBACK2-NEXT: pushq %rax -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %r10 -; FALLBACK2-NEXT: movq 32(%rdi), %r11 -; FALLBACK2-NEXT: movq 40(%rdi), %rbx -; FALLBACK2-NEXT: movq 48(%rdi), %r14 -; FALLBACK2-NEXT: movq 56(%rdi), %rdi -; FALLBACK2-NEXT: movl (%rsi), %eax -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: leal (,%rax,8), %ecx -; FALLBACK2-NEXT: andl $56, %ecx -; FALLBACK2-NEXT: movl %ecx, %esi -; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 -; FALLBACK2-NEXT: notb %cl -; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi -; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi -; FALLBACK2-NEXT: orq %r9, %rdi -; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 -; FALLBACK2-NEXT: addq %r8, %r8 -; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 -; FALLBACK2-NEXT: orq %r9, %r8 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK2-NEXT: shrxq %rsi, %r11, %rbx -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 -; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 -; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 -; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 -; FALLBACK2-NEXT: addq %r11, %r11 -; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 -; FALLBACK2-NEXT: orq %r10, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rsi, %r10, %rbx -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 -; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 -; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 -; FALLBACK2-NEXT: orq %rbx, %r12 -; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx -; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 -; FALLBACK2-NEXT: orq %rbx, %r10 -; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 -; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx -; FALLBACK2-NEXT: orq %rbx, %rcx -; FALLBACK2-NEXT: shrxq %rsi, %rax, %rax -; FALLBACK2-NEXT: movq %rax, 56(%rdx) -; FALLBACK2-NEXT: movq %rcx, 48(%rdx) -; FALLBACK2-NEXT: movq %r10, 32(%rdx) -; FALLBACK2-NEXT: movq %r12, 40(%rdx) -; FALLBACK2-NEXT: movq %r11, 16(%rdx) -; FALLBACK2-NEXT: movq %r9, 24(%rdx) -; FALLBACK2-NEXT: movq %r8, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) -; FALLBACK2-NEXT: addq $8, %rsp -; FALLBACK2-NEXT: popq %rbx -; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r14 -; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: lshr_64bytes: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: pushq %r15 -; FALLBACK3-NEXT: pushq %r14 -; FALLBACK3-NEXT: pushq %rbx -; FALLBACK3-NEXT: movq (%rdi), %rcx -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %r10 -; FALLBACK3-NEXT: movq 32(%rdi), %r11 -; FALLBACK3-NEXT: movq 40(%rdi), %rbx -; FALLBACK3-NEXT: movq 48(%rdi), %r14 -; FALLBACK3-NEXT: movq 56(%rdi), %rdi -; FALLBACK3-NEXT: movl (%rsi), %eax -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: leal (,%rax,8), %ecx -; FALLBACK3-NEXT: andl $56, %ecx -; FALLBACK3-NEXT: andl $56, %eax -; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK3-NEXT: movq %r9, %r8 -; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK3-NEXT: movq %r11, %rbx -; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx -; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK3-NEXT: movq %r14, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax -; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi -; FALLBACK3-NEXT: movq %r11, 48(%rdx) -; FALLBACK3-NEXT: movq %r10, 32(%rdx) -; FALLBACK3-NEXT: movq %r15, 40(%rdx) -; FALLBACK3-NEXT: movq %rdi, 16(%rdx) -; FALLBACK3-NEXT: movq %rbx, 24(%rdx) -; FALLBACK3-NEXT: movq %rsi, (%rdx) -; FALLBACK3-NEXT: movq %r8, 8(%rdx) -; FALLBACK3-NEXT: movq %rax, 56(%rdx) -; FALLBACK3-NEXT: popq %rbx -; FALLBACK3-NEXT: popq %r14 -; FALLBACK3-NEXT: popq %r15 -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: lshr_64bytes: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq %rbp -; FALLBACK4-NEXT: pushq %r15 -; FALLBACK4-NEXT: pushq %r14 -; FALLBACK4-NEXT: pushq %r13 -; FALLBACK4-NEXT: pushq %r12 -; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: pushq %rax -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK4-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK4-NEXT: movl (%rsi), %r8d -; FALLBACK4-NEXT: xorps %xmm4, %xmm4 -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: leal (,%r8,8), %eax -; FALLBACK4-NEXT: andl $56, %eax -; FALLBACK4-NEXT: andl $56, %r8d -; FALLBACK4-NEXT: movq -128(%rsp,%r8), %r10 -; FALLBACK4-NEXT: movq -120(%rsp,%r8), %r9 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r9,%r9), %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -104(%rsp,%r8), %r10 -; FALLBACK4-NEXT: movq %r10, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbx -; FALLBACK4-NEXT: movq -96(%rsp,%r8), %r12 -; FALLBACK4-NEXT: leaq (%r12,%r12), %r11 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r11 -; FALLBACK4-NEXT: orq %rbx, %r11 -; FALLBACK4-NEXT: movq -112(%rsp,%r8), %rbx -; FALLBACK4-NEXT: movq %rbx, %r14 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r14 -; FALLBACK4-NEXT: addq %r10, %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r14, %r10 -; FALLBACK4-NEXT: movq -88(%rsp,%r8), %r14 -; FALLBACK4-NEXT: movq %r14, %r13 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r13 -; FALLBACK4-NEXT: movq -80(%rsp,%r8), %rbp -; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r15 -; FALLBACK4-NEXT: orq %r13, %r15 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r12 -; FALLBACK4-NEXT: addq %r14, %r14 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r14 -; FALLBACK4-NEXT: orq %r12, %r14 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbp -; FALLBACK4-NEXT: movq -72(%rsp,%r8), %r8 -; FALLBACK4-NEXT: leaq (%r8,%r8), %r12 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r12 -; FALLBACK4-NEXT: orq %rbp, %r12 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r9 -; FALLBACK4-NEXT: addq %rbx, %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r9, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: movq %r8, 56(%rdx) -; FALLBACK4-NEXT: movq %rbx, 8(%rdx) -; FALLBACK4-NEXT: movq %r12, 48(%rdx) -; FALLBACK4-NEXT: movq %r14, 32(%rdx) -; FALLBACK4-NEXT: movq %r15, 40(%rdx) -; FALLBACK4-NEXT: movq %r10, 16(%rdx) -; FALLBACK4-NEXT: movq %r11, 24(%rdx) -; FALLBACK4-NEXT: movq %rdi, (%rdx) -; FALLBACK4-NEXT: addq $8, %rsp -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: popq %r12 -; FALLBACK4-NEXT: popq %r13 -; FALLBACK4-NEXT: popq %r14 -; FALLBACK4-NEXT: popq %r15 -; FALLBACK4-NEXT: popq %rbp -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: lshr_64bytes: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: pushq %r15 -; FALLBACK5-NEXT: pushq %r14 -; FALLBACK5-NEXT: pushq %rbx -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK5-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK5-NEXT: movl (%rsi), %eax -; FALLBACK5-NEXT: xorps %xmm4, %xmm4 -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: andl $56, %ecx -; FALLBACK5-NEXT: andl $56, %eax -; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq %r9, %rsi -; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK5-NEXT: movq %r10, %r8 -; FALLBACK5-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK5-NEXT: movq %r11, %rbx -; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK5-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK5-NEXT: movq %rax, %r15 -; FALLBACK5-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK5-NEXT: shrq %cl, %r11 -; FALLBACK5-NEXT: movq %r15, 8(%rdx) -; FALLBACK5-NEXT: movq %r9, 48(%rdx) -; FALLBACK5-NEXT: movq %r11, 56(%rdx) -; FALLBACK5-NEXT: movq %rdi, 32(%rdx) -; FALLBACK5-NEXT: movq %rbx, 40(%rdx) -; FALLBACK5-NEXT: movq %r8, 16(%rdx) -; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r14, (%rdx) -; FALLBACK5-NEXT: popq %rbx -; FALLBACK5-NEXT: popq %r14 -; FALLBACK5-NEXT: popq %r15 -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: lshr_64bytes: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %r15 -; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r13 -; FALLBACK6-NEXT: pushq %r12 -; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK6-NEXT: movl (%rsi), %eax -; FALLBACK6-NEXT: xorps %xmm4, %xmm4 -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %ecx -; FALLBACK6-NEXT: andl $56, %ecx -; FALLBACK6-NEXT: movl %ecx, %esi -; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 -; FALLBACK6-NEXT: notb %cl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK6-NEXT: leaq (%r10,%r10), %rdi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r8, %rdi -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 -; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 -; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 -; FALLBACK6-NEXT: orq %rbx, %r8 -; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx -; FALLBACK6-NEXT: addq %r11, %r11 -; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 -; FALLBACK6-NEXT: orq %rbx, %r11 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx -; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 -; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 -; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 -; FALLBACK6-NEXT: orq %r15, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 -; FALLBACK6-NEXT: addq %rbx, %rbx -; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx -; FALLBACK6-NEXT: orq %r14, %rbx -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 -; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 -; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 -; FALLBACK6-NEXT: orq %r14, %r15 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 -; FALLBACK6-NEXT: addq %r9, %r9 -; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx -; FALLBACK6-NEXT: orq %r10, %rcx -; FALLBACK6-NEXT: shrxq %rsi, %rax, %rax -; FALLBACK6-NEXT: movq %rax, 56(%rdx) -; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %r15, 48(%rdx) -; FALLBACK6-NEXT: movq %rbx, 32(%rdx) -; FALLBACK6-NEXT: movq %r13, 40(%rdx) -; FALLBACK6-NEXT: movq %r11, 16(%rdx) -; FALLBACK6-NEXT: movq %r8, 24(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) -; FALLBACK6-NEXT: popq %rbx -; FALLBACK6-NEXT: popq %r12 -; FALLBACK6-NEXT: popq %r13 -; FALLBACK6-NEXT: popq %r14 -; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: lshr_64bytes: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: pushq %r15 -; FALLBACK7-NEXT: pushq %r14 -; FALLBACK7-NEXT: pushq %rbx -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK7-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK7-NEXT: movl (%rsi), %eax -; FALLBACK7-NEXT: xorps %xmm4, %xmm4 -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: leal (,%rax,8), %ecx -; FALLBACK7-NEXT: andl $56, %ecx -; FALLBACK7-NEXT: andl $56, %eax -; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK7-NEXT: movq %r9, %rsi -; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK7-NEXT: movq %r10, %r8 -; FALLBACK7-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK7-NEXT: movq %r11, %rbx -; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK7-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK7-NEXT: movq %rax, %r15 -; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK7-NEXT: shrxq %rcx, %r11, %r10 -; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK7-NEXT: movq %r15, 8(%rdx) -; FALLBACK7-NEXT: movq %r9, 48(%rdx) -; FALLBACK7-NEXT: movq %rdi, 32(%rdx) -; FALLBACK7-NEXT: movq %rbx, 40(%rdx) -; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rsi, 24(%rdx) -; FALLBACK7-NEXT: movq %r14, (%rdx) -; FALLBACK7-NEXT: movq %r10, 56(%rdx) -; FALLBACK7-NEXT: popq %rbx -; FALLBACK7-NEXT: popq %r14 -; FALLBACK7-NEXT: popq %r15 -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: lshr_64bytes: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: pushq %rbp -; FALLBACK8-NEXT: pushq %r15 -; FALLBACK8-NEXT: pushq %r14 -; FALLBACK8-NEXT: pushq %r13 -; FALLBACK8-NEXT: pushq %r12 -; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: pushq %rax -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK8-NEXT: movl (%rsi), %r9d -; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: leal (,%r9,8), %eax -; FALLBACK8-NEXT: andl $56, %eax -; FALLBACK8-NEXT: andl $56, %r9d -; FALLBACK8-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq -120(%rsp,%r9), %r8 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq %r10, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbx -; FALLBACK8-NEXT: movq -96(%rsp,%r9), %r12 -; FALLBACK8-NEXT: leaq (%r12,%r12), %r11 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r11 -; FALLBACK8-NEXT: orq %rbx, %r11 -; FALLBACK8-NEXT: movq -112(%rsp,%r9), %rbx -; FALLBACK8-NEXT: movq %rbx, %r14 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r14 -; FALLBACK8-NEXT: addq %r10, %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r14, %r10 -; FALLBACK8-NEXT: movq -88(%rsp,%r9), %r14 -; FALLBACK8-NEXT: movq %r14, %r13 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r13 -; FALLBACK8-NEXT: movq -80(%rsp,%r9), %rbp -; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r15 -; FALLBACK8-NEXT: orq %r13, %r15 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r12 -; FALLBACK8-NEXT: addq %r14, %r14 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r14 -; FALLBACK8-NEXT: orq %r12, %r14 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbp -; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %r12 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r12 -; FALLBACK8-NEXT: orq %rbp, %r12 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %rbx, %rbx -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r8, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, 56(%rdx) -; FALLBACK8-NEXT: movq %rbx, 8(%rdx) -; FALLBACK8-NEXT: movq %r12, 48(%rdx) -; FALLBACK8-NEXT: movq %r14, 32(%rdx) -; FALLBACK8-NEXT: movq %r15, 40(%rdx) -; FALLBACK8-NEXT: movq %r10, 16(%rdx) -; FALLBACK8-NEXT: movq %r11, 24(%rdx) -; FALLBACK8-NEXT: movq %rdi, (%rdx) -; FALLBACK8-NEXT: addq $8, %rsp -; FALLBACK8-NEXT: popq %rbx -; FALLBACK8-NEXT: popq %r12 -; FALLBACK8-NEXT: popq %r13 -; FALLBACK8-NEXT: popq %r14 -; FALLBACK8-NEXT: popq %r15 -; FALLBACK8-NEXT: popq %rbp -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: lshr_64bytes: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: pushq %r15 -; FALLBACK9-NEXT: pushq %r14 -; FALLBACK9-NEXT: pushq %rbx -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax -; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx -; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK9-NEXT: movq %r10, %r8 -; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK9-NEXT: movq %r11, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 -; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK9-NEXT: shrq %cl, %r11 -; FALLBACK9-NEXT: movq %r15, 8(%rdx) -; FALLBACK9-NEXT: movq %r9, 48(%rdx) -; FALLBACK9-NEXT: movq %r11, 56(%rdx) -; FALLBACK9-NEXT: movq %rdi, 32(%rdx) -; FALLBACK9-NEXT: movq %rbx, 40(%rdx) -; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r14, (%rdx) -; FALLBACK9-NEXT: popq %rbx -; FALLBACK9-NEXT: popq %r14 -; FALLBACK9-NEXT: popq %r15 -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: lshr_64bytes: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %r15 -; FALLBACK10-NEXT: pushq %r14 -; FALLBACK10-NEXT: pushq %r13 -; FALLBACK10-NEXT: pushq %r12 -; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %esi -; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rsi,8), %eax -; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: movl %eax, %ecx -; FALLBACK10-NEXT: andl $56, %esi -; FALLBACK10-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: movq -120(%rsp,%rsi), %r10 -; FALLBACK10-NEXT: movq -112(%rsp,%rsi), %r9 -; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %r8, %rdi -; FALLBACK10-NEXT: movq -104(%rsp,%rsi), %r11 -; FALLBACK10-NEXT: shrxq %rcx, %r11, %rbx -; FALLBACK10-NEXT: movq -96(%rsp,%rsi), %r14 -; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %r8 -; FALLBACK10-NEXT: orq %rbx, %r8 -; FALLBACK10-NEXT: shrxq %rcx, %r9, %rbx -; FALLBACK10-NEXT: addq %r11, %r11 -; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 -; FALLBACK10-NEXT: orq %rbx, %r11 -; FALLBACK10-NEXT: movq -88(%rsp,%rsi), %rbx -; FALLBACK10-NEXT: shrxq %rcx, %rbx, %r15 -; FALLBACK10-NEXT: movq -80(%rsp,%rsi), %r12 -; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 -; FALLBACK10-NEXT: shlxq %rax, %r13, %r13 -; FALLBACK10-NEXT: orq %r15, %r13 -; FALLBACK10-NEXT: shrxq %rcx, %r14, %r14 -; FALLBACK10-NEXT: addq %rbx, %rbx -; FALLBACK10-NEXT: shlxq %rax, %rbx, %rbx -; FALLBACK10-NEXT: orq %r14, %rbx -; FALLBACK10-NEXT: shrxq %rcx, %r12, %r14 -; FALLBACK10-NEXT: movq -72(%rsp,%rsi), %rsi -; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r15 -; FALLBACK10-NEXT: shlxq %rax, %r15, %r15 -; FALLBACK10-NEXT: orq %r14, %r15 -; FALLBACK10-NEXT: shrxq %rcx, %r10, %r10 -; FALLBACK10-NEXT: addq %r9, %r9 -; FALLBACK10-NEXT: shlxq %rax, %r9, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: shrxq %rcx, %rsi, %rcx -; FALLBACK10-NEXT: movq %rcx, 56(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %r15, 48(%rdx) -; FALLBACK10-NEXT: movq %rbx, 32(%rdx) -; FALLBACK10-NEXT: movq %r13, 40(%rdx) -; FALLBACK10-NEXT: movq %r11, 16(%rdx) -; FALLBACK10-NEXT: movq %r8, 24(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) -; FALLBACK10-NEXT: popq %rbx -; FALLBACK10-NEXT: popq %r12 -; FALLBACK10-NEXT: popq %r13 -; FALLBACK10-NEXT: popq %r14 -; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: lshr_64bytes: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: pushq %r15 -; FALLBACK11-NEXT: pushq %r14 -; FALLBACK11-NEXT: pushq %rbx -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK11-NEXT: movl (%rsi), %eax -; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: leal (,%rax,8), %ecx -; FALLBACK11-NEXT: andl $56, %ecx -; FALLBACK11-NEXT: andl $56, %eax -; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq %r9, %rsi -; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK11-NEXT: movq %r10, %r8 -; FALLBACK11-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK11-NEXT: movq %r11, %rbx -; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK11-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r15 -; FALLBACK11-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK11-NEXT: shrxq %rcx, %r11, %r10 -; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK11-NEXT: movq %r15, 8(%rdx) -; FALLBACK11-NEXT: movq %r9, 48(%rdx) -; FALLBACK11-NEXT: movq %rdi, 32(%rdx) -; FALLBACK11-NEXT: movq %rbx, 40(%rdx) -; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rsi, 24(%rdx) -; FALLBACK11-NEXT: movq %r14, (%rdx) -; FALLBACK11-NEXT: movq %r10, 56(%rdx) -; FALLBACK11-NEXT: popq %rbx -; FALLBACK11-NEXT: popq %r14 -; FALLBACK11-NEXT: popq %r15 -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: lshr_64bytes: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: pushq %rbp -; FALLBACK12-NEXT: pushq %r15 -; FALLBACK12-NEXT: pushq %r14 -; FALLBACK12-NEXT: pushq %r13 -; FALLBACK12-NEXT: pushq %r12 -; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: pushq %rax -; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK12-NEXT: movl (%rsi), %r9d -; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%r9,8), %eax -; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %r9d -; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 -; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r11 -; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx -; FALLBACK12-NEXT: movq %rbx, %r14 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 -; FALLBACK12-NEXT: movq %r14, %r13 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp -; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r15 -; FALLBACK12-NEXT: orq %r13, %r15 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r12 -; FALLBACK12-NEXT: addq %r14, %r14 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r14 -; FALLBACK12-NEXT: orq %r12, %r14 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r12 -; FALLBACK12-NEXT: orq %rbp, %r12 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %rbx, %rbx -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r8, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 56(%rdx) -; FALLBACK12-NEXT: movq %rbx, 8(%rdx) -; FALLBACK12-NEXT: movq %r12, 48(%rdx) -; FALLBACK12-NEXT: movq %r14, 32(%rdx) -; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) -; FALLBACK12-NEXT: movq %r11, 24(%rdx) -; FALLBACK12-NEXT: movq %rdi, (%rdx) -; FALLBACK12-NEXT: addq $8, %rsp -; FALLBACK12-NEXT: popq %rbx -; FALLBACK12-NEXT: popq %r12 -; FALLBACK12-NEXT: popq %r13 -; FALLBACK12-NEXT: popq %r14 -; FALLBACK12-NEXT: popq %r15 -; FALLBACK12-NEXT: popq %rbp -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: lshr_64bytes: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: pushq %r15 -; FALLBACK13-NEXT: pushq %r14 -; FALLBACK13-NEXT: pushq %rbx -; FALLBACK13-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK13-NEXT: movl (%rsi), %edi -; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: leal (,%rdi,8), %ecx -; FALLBACK13-NEXT: andl $56, %ecx -; FALLBACK13-NEXT: andl $56, %edi -; FALLBACK13-NEXT: movq -96(%rsp,%rdi), %rsi -; FALLBACK13-NEXT: movq -104(%rsp,%rdi), %r9 -; FALLBACK13-NEXT: movq %r9, %rax -; FALLBACK13-NEXT: shrdq %cl, %rsi, %rax -; FALLBACK13-NEXT: movq -112(%rsp,%rdi), %r10 -; FALLBACK13-NEXT: movq %r10, %r8 -; FALLBACK13-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK13-NEXT: movq -80(%rsp,%rdi), %r9 -; FALLBACK13-NEXT: movq -88(%rsp,%rdi), %r11 -; FALLBACK13-NEXT: movq %r11, %rbx -; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK13-NEXT: shrdq %cl, %r11, %rsi -; FALLBACK13-NEXT: movq -72(%rsp,%rdi), %r11 -; FALLBACK13-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK13-NEXT: movq -128(%rsp,%rdi), %r14 -; FALLBACK13-NEXT: movq -120(%rsp,%rdi), %rdi -; FALLBACK13-NEXT: movq %rdi, %r15 -; FALLBACK13-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK13-NEXT: shrdq %cl, %rdi, %r14 -; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: shrq %cl, %r11 -; FALLBACK13-NEXT: movq %r15, 8(%rdx) -; FALLBACK13-NEXT: movq %r9, 48(%rdx) -; FALLBACK13-NEXT: movq %r11, 56(%rdx) -; FALLBACK13-NEXT: movq %rsi, 32(%rdx) -; FALLBACK13-NEXT: movq %rbx, 40(%rdx) -; FALLBACK13-NEXT: movq %r8, 16(%rdx) -; FALLBACK13-NEXT: movq %rax, 24(%rdx) -; FALLBACK13-NEXT: movq %r14, (%rdx) -; FALLBACK13-NEXT: popq %rbx -; FALLBACK13-NEXT: popq %r14 -; FALLBACK13-NEXT: popq %r15 -; FALLBACK13-NEXT: vzeroupper -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: lshr_64bytes: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %r15 -; FALLBACK14-NEXT: pushq %r14 -; FALLBACK14-NEXT: pushq %r13 -; FALLBACK14-NEXT: pushq %r12 -; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK14-NEXT: movl (%rsi), %esi -; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rsi,8), %eax -; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: movl %eax, %ecx -; FALLBACK14-NEXT: andl $56, %esi -; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %r10 -; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %r9 -; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %r8, %rdi -; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: shrxq %rcx, %r11, %rbx -; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r14 -; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %r8 -; FALLBACK14-NEXT: orq %rbx, %r8 -; FALLBACK14-NEXT: shrxq %rcx, %r9, %rbx -; FALLBACK14-NEXT: addq %r11, %r11 -; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 -; FALLBACK14-NEXT: orq %rbx, %r11 -; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %rbx -; FALLBACK14-NEXT: shrxq %rcx, %rbx, %r15 -; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 -; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 -; FALLBACK14-NEXT: shlxq %rax, %r13, %r13 -; FALLBACK14-NEXT: orq %r15, %r13 -; FALLBACK14-NEXT: shrxq %rcx, %r14, %r14 -; FALLBACK14-NEXT: addq %rbx, %rbx -; FALLBACK14-NEXT: shlxq %rax, %rbx, %rbx -; FALLBACK14-NEXT: orq %r14, %rbx -; FALLBACK14-NEXT: shrxq %rcx, %r12, %r14 -; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r15 -; FALLBACK14-NEXT: shlxq %rax, %r15, %r15 -; FALLBACK14-NEXT: orq %r14, %r15 -; FALLBACK14-NEXT: shrxq %rcx, %r10, %r10 -; FALLBACK14-NEXT: addq %r9, %r9 -; FALLBACK14-NEXT: shlxq %rax, %r9, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: movq %rcx, 56(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %r15, 48(%rdx) -; FALLBACK14-NEXT: movq %rbx, 32(%rdx) -; FALLBACK14-NEXT: movq %r13, 40(%rdx) -; FALLBACK14-NEXT: movq %r11, 16(%rdx) -; FALLBACK14-NEXT: movq %r8, 24(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) -; FALLBACK14-NEXT: popq %rbx -; FALLBACK14-NEXT: popq %r12 -; FALLBACK14-NEXT: popq %r13 -; FALLBACK14-NEXT: popq %r14 -; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: lshr_64bytes: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: pushq %r15 -; FALLBACK15-NEXT: pushq %r14 -; FALLBACK15-NEXT: pushq %rbx -; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax -; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK15-NEXT: movq %r10, %r8 -; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK15-NEXT: movq %r11, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 -; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 -; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK15-NEXT: movq %r15, 8(%rdx) -; FALLBACK15-NEXT: movq %r9, 48(%rdx) -; FALLBACK15-NEXT: movq %rdi, 32(%rdx) -; FALLBACK15-NEXT: movq %rbx, 40(%rdx) -; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rsi, 24(%rdx) -; FALLBACK15-NEXT: movq %r14, (%rdx) -; FALLBACK15-NEXT: movq %r10, 56(%rdx) -; FALLBACK15-NEXT: popq %rbx -; FALLBACK15-NEXT: popq %r14 -; FALLBACK15-NEXT: popq %r15 -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq -; -; FALLBACK16-LABEL: lshr_64bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $204, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 16(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 20(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 24(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 28(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 40(%eax), %ebp -; FALLBACK16-NEXT: movl 44(%eax), %ebx -; FALLBACK16-NEXT: movl 48(%eax), %edi -; FALLBACK16-NEXT: movl 52(%eax), %esi -; FALLBACK16-NEXT: movl 56(%eax), %edx -; FALLBACK16-NEXT: movl 60(%eax), %ecx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %eax -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %eax, %esi -; FALLBACK16-NEXT: andl $60, %esi -; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK16-NEXT: shll $3, %eax -; FALLBACK16-NEXT: andl $24, %eax -; FALLBACK16-NEXT: movl %edx, %edi -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK16-NEXT: movb %al, %ch -; FALLBACK16-NEXT: notb %ch -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %edx, %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %edi, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK16-NEXT: movl %edx, %ebp -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %ebp, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %edx, %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movl %eax, %edx -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: addl %eax, %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebx, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %ebx, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %eax, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK16-NEXT: leal (%edx,%edx), %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebx, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %ebp, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: addl %edi, %edi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %edx, %edi -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK16-NEXT: movl %esi, %ebx -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK16-NEXT: leal (%eax,%eax), %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %esi, %esi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %ebx, %esi -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %eax, %edx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl %ebx, 60(%eax) -; FALLBACK16-NEXT: movl %edx, 56(%eax) -; FALLBACK16-NEXT: movl %esi, 48(%eax) -; FALLBACK16-NEXT: movl %ebp, 52(%eax) -; FALLBACK16-NEXT: movl %edi, 40(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 44(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 32(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 36(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 24(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 28(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 16(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 20(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 8(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 12(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, (%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $204, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: lshr_64bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $188, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 12(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 16(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 20(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 24(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 28(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 36(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%ecx), %ebp -; FALLBACK17-NEXT: movl 44(%ecx), %ebx -; FALLBACK17-NEXT: movl 48(%ecx), %edi -; FALLBACK17-NEXT: movl 52(%ecx), %esi -; FALLBACK17-NEXT: movl 56(%ecx), %edx -; FALLBACK17-NEXT: movl 60(%ecx), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %ecx -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ecx, %ebp -; FALLBACK17-NEXT: andl $60, %ebp -; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shll $3, %ecx -; FALLBACK17-NEXT: andl $24, %ecx -; FALLBACK17-NEXT: shrdl %cl, %edx, %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %esi -; FALLBACK17-NEXT: shrdl %cl, %edi, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %esi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %edi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %esi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl %esi, %edx -; FALLBACK17-NEXT: shrdl %cl, %eax, %edi -; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %esi, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %edx, %edi -; FALLBACK17-NEXT: shrdl %cl, %eax, %esi -; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 56(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK17-NEXT: shrl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 60(%ebp) -; FALLBACK17-NEXT: movl %esi, 48(%ebp) -; FALLBACK17-NEXT: movl %edi, 52(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 40(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 44(%ebp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 32(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 36(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 24(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 16(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %ebx, (%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) -; FALLBACK17-NEXT: addl $188, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: popl %ebp -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: lshr_64bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $204, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 12(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 20(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 28(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 36(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 44(%eax), %ebp -; FALLBACK18-NEXT: movl 48(%eax), %edi -; FALLBACK18-NEXT: movl 52(%eax), %esi -; FALLBACK18-NEXT: movl 56(%eax), %edx -; FALLBACK18-NEXT: movl 60(%eax), %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %ebx -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: leal (,%ebx,8), %edx -; FALLBACK18-NEXT: andl $24, %edx -; FALLBACK18-NEXT: movl %edx, %ecx -; FALLBACK18-NEXT: andl $60, %ebx -; FALLBACK18-NEXT: movl 68(%esp,%ebx), %esi -; FALLBACK18-NEXT: movl 72(%esp,%ebx), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%eax,%eax), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: movl 76(%esp,%ebx), %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 88(%esp,%ebx), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: orl %esi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 96(%esp,%ebx), %esi -; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: movl 92(%esp,%ebx), %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 104(%esp,%ebx), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: movl 100(%esp,%ebx), %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: orl %esi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 112(%esp,%ebx), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: movl 108(%esp,%ebx), %esi -; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %ecx, %ebp -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ecx -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 120(%esp,%ebx), %edi -; FALLBACK18-NEXT: leal (%edi,%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK18-NEXT: movl 116(%esp,%ebx), %eax -; FALLBACK18-NEXT: movl %ebp, %ecx -; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebp -; FALLBACK18-NEXT: orl %ebp, %esi -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %ecx, %ebp -; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK18-NEXT: movl 124(%esp,%ebx), %eax -; FALLBACK18-NEXT: leal (%eax,%eax), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %edx -; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi -; FALLBACK18-NEXT: orl %edi, %edx -; FALLBACK18-NEXT: shrxl %ebp, %eax, %edi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edi, 60(%eax) -; FALLBACK18-NEXT: movl %edx, 56(%eax) -; FALLBACK18-NEXT: movl %ecx, 48(%eax) -; FALLBACK18-NEXT: movl %esi, 52(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 40(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 44(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 32(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 36(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 24(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 28(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) -; FALLBACK18-NEXT: addl $204, %esp -; FALLBACK18-NEXT: popl %esi -; FALLBACK18-NEXT: popl %edi -; FALLBACK18-NEXT: popl %ebx -; FALLBACK18-NEXT: popl %ebp -; FALLBACK18-NEXT: retl -; -; FALLBACK19-LABEL: lshr_64bytes: -; FALLBACK19: # %bb.0: -; FALLBACK19-NEXT: pushl %ebp -; FALLBACK19-NEXT: pushl %ebx -; FALLBACK19-NEXT: pushl %edi -; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $188, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 12(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 16(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 20(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 24(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 28(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 36(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%ecx), %ebp -; FALLBACK19-NEXT: movl 44(%ecx), %ebx -; FALLBACK19-NEXT: movl 48(%ecx), %edi -; FALLBACK19-NEXT: movl 52(%ecx), %esi -; FALLBACK19-NEXT: movl 56(%ecx), %edx -; FALLBACK19-NEXT: movl 60(%ecx), %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %ecx -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ecx, %ebp -; FALLBACK19-NEXT: andl $60, %ebp -; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shll $3, %ecx -; FALLBACK19-NEXT: andl $24, %ecx -; FALLBACK19-NEXT: shrdl %cl, %edx, %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %esi -; FALLBACK19-NEXT: shrdl %cl, %edi, %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %esi, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %edi -; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %edi, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx -; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %edi -; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %esi, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi -; FALLBACK19-NEXT: movl %edi, %edx -; FALLBACK19-NEXT: shrdl %cl, %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %edi, %esi -; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl %eax, 56(%ebp) -; FALLBACK19-NEXT: movl %esi, 48(%ebp) -; FALLBACK19-NEXT: movl %edx, 52(%ebp) -; FALLBACK19-NEXT: movl %ebx, 40(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 44(%ebp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 32(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 36(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 24(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 28(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 16(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 20(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 8(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 12(%ebp) -; FALLBACK19-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: shrdl %cl, %edx, %edi -; FALLBACK19-NEXT: movl %edi, (%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 4(%ebp) -; FALLBACK19-NEXT: movl %eax, 60(%ebp) -; FALLBACK19-NEXT: addl $188, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: lshr_64bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $204, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK20-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK20-NEXT: movl (%eax), %eax -; FALLBACK20-NEXT: xorps %xmm4, %xmm4 -; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %eax, %esi -; FALLBACK20-NEXT: andl $60, %esi -; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK20-NEXT: shll $3, %eax -; FALLBACK20-NEXT: andl $24, %eax -; FALLBACK20-NEXT: movl %edx, %edi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movb %al, %ch -; FALLBACK20-NEXT: notb %ch -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %edx, %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %edi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK20-NEXT: movl %edx, %ebp -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %edx, %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: addl %eax, %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK20-NEXT: leal (%edx,%edx), %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK20-NEXT: movl %edi, %ebp -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: addl %edi, %edi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %edx, %edi -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK20-NEXT: movl %esi, %ebx -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK20-NEXT: leal (%eax,%eax), %ebp -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %eax, %edx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl %ebx, 60(%eax) -; FALLBACK20-NEXT: movl %edx, 56(%eax) -; FALLBACK20-NEXT: movl %esi, 48(%eax) -; FALLBACK20-NEXT: movl %ebp, 52(%eax) -; FALLBACK20-NEXT: movl %edi, 40(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 44(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 32(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 36(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 24(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 28(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 16(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 20(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 8(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 12(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, (%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 4(%eax) -; FALLBACK20-NEXT: addl $204, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: lshr_64bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $188, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movups (%ecx), %xmm0 -; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK21-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK21-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK21-NEXT: movl (%eax), %ecx -; FALLBACK21-NEXT: xorps %xmm4, %xmm4 -; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %ecx, %ebp -; FALLBACK21-NEXT: andl $60, %ebp -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shll $3, %ecx -; FALLBACK21-NEXT: andl $24, %ecx -; FALLBACK21-NEXT: shrdl %cl, %edx, %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %esi -; FALLBACK21-NEXT: shrdl %cl, %edi, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %edi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl %esi, %edx -; FALLBACK21-NEXT: shrdl %cl, %eax, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %esi, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %edx, %edi -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %edx, 56(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK21-NEXT: shrl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 60(%ebp) -; FALLBACK21-NEXT: movl %esi, 48(%ebp) -; FALLBACK21-NEXT: movl %edi, 52(%ebp) -; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 40(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 44(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 32(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 36(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 24(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 28(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %ebx, (%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 4(%ebp) -; FALLBACK21-NEXT: addl $188, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: lshr_64bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $204, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK22-NEXT: movl (%eax), %ebx -; FALLBACK22-NEXT: xorps %xmm4, %xmm4 -; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%ebx,8), %edx -; FALLBACK22-NEXT: andl $24, %edx -; FALLBACK22-NEXT: movl %edx, %ecx -; FALLBACK22-NEXT: andl $60, %ebx -; FALLBACK22-NEXT: movl 68(%esp,%ebx), %esi -; FALLBACK22-NEXT: movl 72(%esp,%ebx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK22-NEXT: notb %dl -; FALLBACK22-NEXT: leal (%eax,%eax), %ebp -; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK22-NEXT: orl %edi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %eax -; FALLBACK22-NEXT: orl %edi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: movl 76(%esp,%ebx), %edi -; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %edi -; FALLBACK22-NEXT: orl %eax, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 88(%esp,%ebx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 96(%esp,%ebx), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: movl 92(%esp,%ebx), %edi -; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %edi -; FALLBACK22-NEXT: orl %eax, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 104(%esp,%ebx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: movl 100(%esp,%ebx), %edi -; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 112(%esp,%ebx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %eax -; FALLBACK22-NEXT: movl 108(%esp,%ebx), %esi -; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK22-NEXT: orl %edi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %ecx, %ebp -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ecx -; FALLBACK22-NEXT: orl %eax, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 120(%esp,%ebx), %edi -; FALLBACK22-NEXT: leal (%edi,%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 116(%esp,%ebx), %eax -; FALLBACK22-NEXT: movl %ebp, %ecx -; FALLBACK22-NEXT: shrxl %ebp, %eax, %ebp -; FALLBACK22-NEXT: orl %ebp, %esi -; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %ebp -; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebx), %eax -; FALLBACK22-NEXT: leal (%eax,%eax), %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %edx -; FALLBACK22-NEXT: shrxl %ebp, %edi, %edi -; FALLBACK22-NEXT: orl %edi, %edx -; FALLBACK22-NEXT: shrxl %ebp, %eax, %edi -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edi, 60(%eax) -; FALLBACK22-NEXT: movl %edx, 56(%eax) -; FALLBACK22-NEXT: movl %ecx, 48(%eax) -; FALLBACK22-NEXT: movl %esi, 52(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 40(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 44(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 32(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 36(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 24(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 28(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 16(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 20(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 8(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 12(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, (%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 4(%eax) -; FALLBACK22-NEXT: addl $204, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: lshr_64bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $188, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movups (%ecx), %xmm0 -; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK23-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK23-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK23-NEXT: movl (%eax), %ecx -; FALLBACK23-NEXT: xorps %xmm4, %xmm4 -; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %ecx, %ebp -; FALLBACK23-NEXT: andl $60, %ebp -; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shll $3, %ecx -; FALLBACK23-NEXT: andl $24, %ecx -; FALLBACK23-NEXT: shrdl %cl, %edx, %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %esi -; FALLBACK23-NEXT: shrdl %cl, %edi, %esi -; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %edi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx -; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl %edi, %edx -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %edi, %esi -; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK23-NEXT: movl %eax, 56(%ebp) -; FALLBACK23-NEXT: movl %esi, 48(%ebp) -; FALLBACK23-NEXT: movl %edx, 52(%ebp) -; FALLBACK23-NEXT: movl %ebx, 40(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 44(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 32(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 36(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 24(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 28(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 16(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 20(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 8(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 12(%ebp) -; FALLBACK23-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %edx, %edi -; FALLBACK23-NEXT: movl %edi, (%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 4(%ebp) -; FALLBACK23-NEXT: movl %eax, 60(%ebp) -; FALLBACK23-NEXT: addl $188, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: lshr_64bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $204, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK24-NEXT: movl (%eax), %ecx -; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, %esi -; FALLBACK24-NEXT: andl $60, %esi -; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK24-NEXT: shll $3, %ecx -; FALLBACK24-NEXT: andl $24, %ecx -; FALLBACK24-NEXT: movl %edx, %edi -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: movl 72(%esp,%esi), %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%eax,%eax), %ebx -; FALLBACK24-NEXT: movl %ecx, %ebp -; FALLBACK24-NEXT: movb %cl, %ch -; FALLBACK24-NEXT: notb %ch -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK24-NEXT: movl %ebp, %eax -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %edx, %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %edi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK24-NEXT: movl %edx, %ebp -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %edx, %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: addl %eax, %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK24-NEXT: leal (%edx,%edx), %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK24-NEXT: movl %edi, %ebp -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: addl %edi, %edi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %edx, %edi -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK24-NEXT: movl %esi, %ebx -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK24-NEXT: leal (%eax,%eax), %ebp -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %eax, %edx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl %ebx, 60(%eax) -; FALLBACK24-NEXT: movl %edx, 56(%eax) -; FALLBACK24-NEXT: movl %esi, 48(%eax) -; FALLBACK24-NEXT: movl %ebp, 52(%eax) -; FALLBACK24-NEXT: movl %edi, 40(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 44(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 32(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 36(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 24(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 28(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 16(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 20(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 8(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 12(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, (%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 4(%eax) -; FALLBACK24-NEXT: addl $204, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: vzeroupper -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: lshr_64bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $188, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK25-NEXT: movl (%eax), %ecx -; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ecx, %ebp -; FALLBACK25-NEXT: andl $60, %ebp -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shll $3, %ecx -; FALLBACK25-NEXT: andl $24, %ecx -; FALLBACK25-NEXT: shrdl %cl, %edx, %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %esi -; FALLBACK25-NEXT: shrdl %cl, %edi, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %edi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl %esi, %edx -; FALLBACK25-NEXT: shrdl %cl, %eax, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %edx, %edi -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %edx, 56(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK25-NEXT: shrl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 60(%ebp) -; FALLBACK25-NEXT: movl %esi, 48(%ebp) -; FALLBACK25-NEXT: movl %edi, 52(%ebp) -; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %ebx, (%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 4(%ebp) -; FALLBACK25-NEXT: addl $188, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: vzeroupper -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: lshr_64bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $204, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK26-NEXT: movl (%eax), %ecx -; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: leal (,%ecx,8), %edx -; FALLBACK26-NEXT: andl $24, %edx -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: andl $60, %ecx -; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK26-NEXT: notb %dl -; FALLBACK26-NEXT: leal (%eax,%eax), %ebp -; FALLBACK26-NEXT: shlxl %edx, %ebp, %ebp -; FALLBACK26-NEXT: orl %edi, %ebp -; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %edi -; FALLBACK26-NEXT: orl %eax, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %edi -; FALLBACK26-NEXT: orl %eax, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %eax -; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK26-NEXT: orl %edi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %eax, %ebp -; FALLBACK26-NEXT: movl 120(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %edx, %eax, %esi -; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %edx, %eax, %eax -; FALLBACK26-NEXT: orl %edi, %eax -; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx -; FALLBACK26-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %edx -; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: orl %edi, %edx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %edi -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: movl %edi, 60(%ecx) -; FALLBACK26-NEXT: movl %edx, 56(%ecx) -; FALLBACK26-NEXT: movl %eax, 48(%ecx) -; FALLBACK26-NEXT: movl %esi, 52(%ecx) -; FALLBACK26-NEXT: movl %ebp, 40(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 44(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 32(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 36(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 24(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 28(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 16(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 8(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, (%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 4(%ecx) -; FALLBACK26-NEXT: addl $204, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: vzeroupper -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: lshr_64bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $188, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK27-NEXT: movl (%eax), %ecx -; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %ecx, %ebp -; FALLBACK27-NEXT: andl $60, %ebp -; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shll $3, %ecx -; FALLBACK27-NEXT: andl $24, %ecx -; FALLBACK27-NEXT: shrdl %cl, %edx, %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, %esi -; FALLBACK27-NEXT: shrdl %cl, %edi, %esi -; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %edi, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %esi -; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx -; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi -; FALLBACK27-NEXT: movl %edi, %edx -; FALLBACK27-NEXT: shrdl %cl, %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %edi, %esi -; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK27-NEXT: movl %eax, 56(%ebp) -; FALLBACK27-NEXT: movl %esi, 48(%ebp) -; FALLBACK27-NEXT: movl %edx, 52(%ebp) -; FALLBACK27-NEXT: movl %ebx, 40(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 44(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 32(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 36(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 24(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 28(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 16(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 20(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 8(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 12(%ebp) -; FALLBACK27-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %edx, %edi -; FALLBACK27-NEXT: movl %edi, (%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 4(%ebp) -; FALLBACK27-NEXT: movl %eax, 60(%ebp) -; FALLBACK27-NEXT: addl $188, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: vzeroupper -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: lshr_64bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $204, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK28-NEXT: movl (%eax), %ecx -; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, %esi -; FALLBACK28-NEXT: andl $60, %esi -; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK28-NEXT: shll $3, %ecx -; FALLBACK28-NEXT: andl $24, %ecx -; FALLBACK28-NEXT: movl %edx, %edi -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: movl 72(%esp,%esi), %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%eax,%eax), %ebx -; FALLBACK28-NEXT: movl %ecx, %ebp -; FALLBACK28-NEXT: movb %cl, %ch -; FALLBACK28-NEXT: notb %ch -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK28-NEXT: movl %ebp, %eax -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %edx, %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %edi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %edx, %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: addl %eax, %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK28-NEXT: leal (%edx,%edx), %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK28-NEXT: movl %edi, %ebp -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: addl %edi, %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK28-NEXT: movl %esi, %ebx -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK28-NEXT: leal (%eax,%eax), %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %eax, %edx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl %ebx, 60(%eax) -; FALLBACK28-NEXT: movl %edx, 56(%eax) -; FALLBACK28-NEXT: movl %esi, 48(%eax) -; FALLBACK28-NEXT: movl %ebp, 52(%eax) -; FALLBACK28-NEXT: movl %edi, 40(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 44(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 32(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 36(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 24(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 28(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 16(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 20(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 8(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 12(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, (%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 4(%eax) -; FALLBACK28-NEXT: addl $204, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: vzeroupper -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: lshr_64bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $188, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK29-NEXT: movl (%eax), %ecx -; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ecx, %ebp -; FALLBACK29-NEXT: andl $60, %ebp -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shll $3, %ecx -; FALLBACK29-NEXT: andl $24, %ecx -; FALLBACK29-NEXT: shrdl %cl, %edx, %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %esi -; FALLBACK29-NEXT: shrdl %cl, %edi, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %edi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl %esi, %edx -; FALLBACK29-NEXT: shrdl %cl, %eax, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %edx, %edi -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %edx, 56(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK29-NEXT: shrl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 60(%ebp) -; FALLBACK29-NEXT: movl %esi, 48(%ebp) -; FALLBACK29-NEXT: movl %edi, 52(%ebp) -; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %ebx, (%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 4(%ebp) -; FALLBACK29-NEXT: addl $188, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: vzeroupper -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: lshr_64bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $204, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK30-NEXT: movl (%eax), %ecx -; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%ecx,8), %edx -; FALLBACK30-NEXT: andl $24, %edx -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: andl $60, %ecx -; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl 72(%esp,%ecx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK30-NEXT: notb %dl -; FALLBACK30-NEXT: leal (%eax,%eax), %ebp -; FALLBACK30-NEXT: shlxl %edx, %ebp, %ebp -; FALLBACK30-NEXT: orl %edi, %ebp -; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edi -; FALLBACK30-NEXT: orl %eax, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edi -; FALLBACK30-NEXT: orl %eax, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %eax -; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK30-NEXT: orl %edi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %eax, %ebp -; FALLBACK30-NEXT: movl 120(%esp,%ecx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %edx, %eax, %esi -; FALLBACK30-NEXT: movl 116(%esp,%ecx), %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %edx, %eax, %eax -; FALLBACK30-NEXT: orl %edi, %eax -; FALLBACK30-NEXT: movl 124(%esp,%ecx), %ecx -; FALLBACK30-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edx -; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: orl %edi, %edx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %edi -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: movl %edi, 60(%ecx) -; FALLBACK30-NEXT: movl %edx, 56(%ecx) -; FALLBACK30-NEXT: movl %eax, 48(%ecx) -; FALLBACK30-NEXT: movl %esi, 52(%ecx) -; FALLBACK30-NEXT: movl %ebp, 40(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 44(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 32(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 36(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 24(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 28(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 16(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 8(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, (%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 4(%ecx) -; FALLBACK30-NEXT: addl $204, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: vzeroupper -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: lshr_64bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $188, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK31-NEXT: movl (%eax), %ecx -; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %ecx, %ebp -; FALLBACK31-NEXT: andl $60, %ebp -; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shll $3, %ecx -; FALLBACK31-NEXT: andl $24, %ecx -; FALLBACK31-NEXT: shrdl %cl, %edx, %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %esi -; FALLBACK31-NEXT: shrdl %cl, %edi, %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %edi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx -; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl %edi, %edx -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %edi, %esi -; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK31-NEXT: movl %eax, 56(%ebp) -; FALLBACK31-NEXT: movl %esi, 48(%ebp) -; FALLBACK31-NEXT: movl %edx, 52(%ebp) -; FALLBACK31-NEXT: movl %ebx, 40(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 44(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 32(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 36(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 24(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 28(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 16(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 20(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 8(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 12(%ebp) -; FALLBACK31-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %edx, %edi -; FALLBACK31-NEXT: movl %edi, (%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 4(%ebp) -; FALLBACK31-NEXT: movl %eax, 60(%ebp) -; FALLBACK31-NEXT: addl $188, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: vzeroupper -; FALLBACK31-NEXT: retl +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %edi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rdi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %edi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r8, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r15, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rdi), %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r13,%r13), %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rdi,%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r13, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r8, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r10,%r10), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r11, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r10, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r15,%r15), %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r12, %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r14, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r15, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rax,%rax), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r14, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r12, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %r8d +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%r8,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %r8d +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%r8), %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%r8), %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r14, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%r8), %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%r8), %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rbp,%rbp), %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r14, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%r8), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbp, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq $8, %rsp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r13 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r10,%r10), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r11, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r14,%r14), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rbx, %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r13 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r13, %r13 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %r13 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r14, %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r12, %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rax,%rax), %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r13, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r13 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r11, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes: +; X64-NO-SHLD-NO-BMI2-AVX1: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbp +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rax +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %r9d +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (,%r9,8), %eax +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %r9d +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%r9), %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r11 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r14, %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%r9), %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r13 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%r9), %rbp +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%rbp,%rbp), %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r13, %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %r14, %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r12, %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbp +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r9,%r9), %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbp, %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r8, %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq $8, %rsp +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbp +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes: +; X64-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rsi), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r10,%r10), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r11, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rsi), %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r14,%r14), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r9, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rsi), %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %rbx, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rsi), %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r13, %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r15, %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r14, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r12, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%rsi,%rsi), %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r15, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r10, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r13, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r11, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes: +; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbp +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rax +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %r9d +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%r9,8), %eax +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %r9d +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%r9), %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r11 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r14, %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%r9), %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r13 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%r9), %rbp +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%rbp,%rbp), %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %r14, %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbp +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r9,%r9), %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbp, %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r8, %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq $8, %rsp +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbp +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %edi +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rdi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %edi +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%rdi), %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rax +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rsi, %rax +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%rdi), %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%rdi), %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%rdi), %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%rdi), %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes: +; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rsi), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r10,%r10), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rsi), %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r14,%r14), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r9, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rsi), %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rbx, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rsi), %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r13, %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r14, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r12, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%rsi,%rsi), %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r15, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rcx, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r13, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq +; +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%eax), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %eax, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edx,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 112(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 116(%esp,%esi), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 120(%esp,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 124(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 60(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 40(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ecx), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ecx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%ecx), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ecx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%ebx,8), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 112(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 120(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 116(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 124(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 52(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%ecx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%ecx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 40(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %ch +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edx,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 112(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 116(%esp,%esi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 120(%esp,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 124(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ebx,%ebx), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 60(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 40(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%ebx,8), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 112(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 120(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%edi,%edi), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 116(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 124(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 52(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 40(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes: +; X86-NO-SHLD-NO-BMI2-AVX1: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1 +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 68(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 72(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %cl, %ch +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 64(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 76(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 80(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edi,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 84(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 88(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %eax, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 92(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 96(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 100(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 104(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edx,%edx), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 108(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 112(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 116(%esp,%esi), %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 120(%esp,%edx), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 124(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ebx,%ebx), %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, 60(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 48(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 40(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0 +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1 +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 104(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 100(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 108(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1 +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%ecx,8), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 68(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 72(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 80(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 76(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 88(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 84(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 96(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 92(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 104(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 100(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 112(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 108(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 120(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 116(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 124(%esp,%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 60(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 56(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 48(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 52(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, 40(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, (%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 4(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 88(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 108(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 40(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: addl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes: +; X86-NO-SHLD-NO-BMI2-AVX512: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0 +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 68(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 72(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %cl, %ch +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 64(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 76(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 80(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edi,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 84(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 88(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %eax, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 92(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 96(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 100(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 104(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edx,%edx), %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 108(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 112(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 116(%esp,%esi), %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 120(%esp,%edx), %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 124(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%ebx,%ebx), %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, 60(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 48(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 40(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0 +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 104(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 100(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 108(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0 +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%ecx,8), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 68(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 72(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 80(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 76(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 88(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 84(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 96(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 92(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 104(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 100(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 112(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 108(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 120(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 116(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 124(%esp,%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 60(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 56(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 48(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 52(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, 40(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, (%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 4(%ecx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 88(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 48(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 108(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 40(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: addl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3 @@ -15993,3774 +13714,3774 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no } define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: shl_64bytes: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %r15 -; FALLBACK0-NEXT: pushq %r14 -; FALLBACK0-NEXT: pushq %r13 -; FALLBACK0-NEXT: pushq %r12 -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rax -; FALLBACK0-NEXT: movq 8(%rdi), %rcx -; FALLBACK0-NEXT: movq 16(%rdi), %r8 -; FALLBACK0-NEXT: movq 24(%rdi), %r9 -; FALLBACK0-NEXT: movq 32(%rdi), %r10 -; FALLBACK0-NEXT: movq 40(%rdi), %r11 -; FALLBACK0-NEXT: movq 48(%rdi), %rbx -; FALLBACK0-NEXT: movq 56(%rdi), %rdi -; FALLBACK0-NEXT: movl (%rsi), %esi -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: leal (,%rsi,8), %eax -; FALLBACK0-NEXT: andl $56, %eax -; FALLBACK0-NEXT: andl $56, %esi -; FALLBACK0-NEXT: negl %esi -; FALLBACK0-NEXT: movslq %esi, %rbx -; FALLBACK0-NEXT: movq -64(%rsp,%rbx), %r8 -; FALLBACK0-NEXT: movq -56(%rsp,%rbx), %rdi -; FALLBACK0-NEXT: movq %rdi, %r10 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq %r8, %r9 -; FALLBACK0-NEXT: shrq %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r9 -; FALLBACK0-NEXT: orq %r10, %r9 -; FALLBACK0-NEXT: movq -40(%rsp,%rbx), %r10 -; FALLBACK0-NEXT: movq %r10, %r14 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r14 -; FALLBACK0-NEXT: movq -48(%rsp,%rbx), %r15 -; FALLBACK0-NEXT: movq %r15, %r11 -; FALLBACK0-NEXT: shrq %r11 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: orq %r14, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r15 -; FALLBACK0-NEXT: shrq %rdi -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rdi -; FALLBACK0-NEXT: orq %r15, %rdi -; FALLBACK0-NEXT: movq -24(%rsp,%rbx), %r14 -; FALLBACK0-NEXT: movq %r14, %r12 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r12 -; FALLBACK0-NEXT: movq -32(%rsp,%rbx), %r13 -; FALLBACK0-NEXT: movq %r13, %r15 -; FALLBACK0-NEXT: shrq %r15 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r15 -; FALLBACK0-NEXT: orq %r12, %r15 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r13 -; FALLBACK0-NEXT: shrq %r10 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: orq %r13, %r10 -; FALLBACK0-NEXT: movq -8(%rsp,%rbx), %r12 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r12 -; FALLBACK0-NEXT: movq -16(%rsp,%rbx), %rbx -; FALLBACK0-NEXT: movq %rbx, %r13 -; FALLBACK0-NEXT: shrq %r13 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r13 -; FALLBACK0-NEXT: orq %r12, %r13 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %rbx -; FALLBACK0-NEXT: shrq %r14 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r14 -; FALLBACK0-NEXT: orq %rbx, %r14 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %r14, 48(%rdx) -; FALLBACK0-NEXT: movq %r13, 56(%rdx) -; FALLBACK0-NEXT: movq %r10, 32(%rdx) -; FALLBACK0-NEXT: movq %r15, 40(%rdx) -; FALLBACK0-NEXT: movq %rdi, 16(%rdx) -; FALLBACK0-NEXT: movq %r11, 24(%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: popq %r12 -; FALLBACK0-NEXT: popq %r13 -; FALLBACK0-NEXT: popq %r14 -; FALLBACK0-NEXT: popq %r15 -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: shl_64bytes: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: pushq %r14 -; FALLBACK1-NEXT: pushq %rbx -; FALLBACK1-NEXT: pushq %rax -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %rcx -; FALLBACK1-NEXT: movq 16(%rdi), %r8 -; FALLBACK1-NEXT: movq 24(%rdi), %r9 -; FALLBACK1-NEXT: movq 32(%rdi), %r10 -; FALLBACK1-NEXT: movq 40(%rdi), %r11 -; FALLBACK1-NEXT: movq 48(%rdi), %rbx -; FALLBACK1-NEXT: movq 56(%rdi), %rdi -; FALLBACK1-NEXT: movl (%rsi), %esi -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: leal (,%rsi,8), %ecx -; FALLBACK1-NEXT: andl $56, %ecx -; FALLBACK1-NEXT: andl $56, %esi -; FALLBACK1-NEXT: negl %esi -; FALLBACK1-NEXT: movslq %esi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%r9), %rax -; FALLBACK1-NEXT: movq -40(%rsp,%r9), %r10 -; FALLBACK1-NEXT: movq %r10, %rsi -; FALLBACK1-NEXT: shldq %cl, %rax, %rsi -; FALLBACK1-NEXT: movq -64(%rsp,%r9), %r8 -; FALLBACK1-NEXT: movq -56(%rsp,%r9), %rdi -; FALLBACK1-NEXT: shldq %cl, %rdi, %rax -; FALLBACK1-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK1-NEXT: movq -24(%rsp,%r9), %rbx -; FALLBACK1-NEXT: movq %rbx, %r14 -; FALLBACK1-NEXT: shldq %cl, %r11, %r14 -; FALLBACK1-NEXT: shldq %cl, %r10, %r11 -; FALLBACK1-NEXT: movq -16(%rsp,%r9), %r10 -; FALLBACK1-NEXT: movq -8(%rsp,%r9), %r9 -; FALLBACK1-NEXT: shldq %cl, %r10, %r9 -; FALLBACK1-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK1-NEXT: shldq %cl, %r8, %rdi -; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: shlq %cl, %r8 -; FALLBACK1-NEXT: movq %r10, 48(%rdx) -; FALLBACK1-NEXT: movq %r9, 56(%rdx) -; FALLBACK1-NEXT: movq %r11, 32(%rdx) -; FALLBACK1-NEXT: movq %r14, 40(%rdx) -; FALLBACK1-NEXT: movq %rax, 16(%rdx) -; FALLBACK1-NEXT: movq %rsi, 24(%rdx) -; FALLBACK1-NEXT: movq %r8, (%rdx) -; FALLBACK1-NEXT: movq %rdi, 8(%rdx) -; FALLBACK1-NEXT: addq $8, %rsp -; FALLBACK1-NEXT: popq %rbx -; FALLBACK1-NEXT: popq %r14 -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: shl_64bytes: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %r15 -; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r12 -; FALLBACK2-NEXT: pushq %rbx -; FALLBACK2-NEXT: pushq %rax -; FALLBACK2-NEXT: movq (%rdi), %rax -; FALLBACK2-NEXT: movq 8(%rdi), %rcx -; FALLBACK2-NEXT: movq 16(%rdi), %r8 -; FALLBACK2-NEXT: movq 24(%rdi), %r9 -; FALLBACK2-NEXT: movq 32(%rdi), %r10 -; FALLBACK2-NEXT: movq 40(%rdi), %r11 -; FALLBACK2-NEXT: movq 48(%rdi), %rbx -; FALLBACK2-NEXT: movq 56(%rdi), %rdi -; FALLBACK2-NEXT: movl (%rsi), %esi -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: leal (,%rsi,8), %eax -; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movl %eax, %ecx -; FALLBACK2-NEXT: andl $56, %esi -; FALLBACK2-NEXT: negl %esi -; FALLBACK2-NEXT: movslq %esi, %rsi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: shlxq %rcx, %rdi, %r8 -; FALLBACK2-NEXT: notb %al -; FALLBACK2-NEXT: shlxq %rcx, %r9, %r10 -; FALLBACK2-NEXT: shrq %r9 -; FALLBACK2-NEXT: shrxq %rax, %r9, %r9 -; FALLBACK2-NEXT: orq %r8, %r9 -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %r11 -; FALLBACK2-NEXT: shlxq %rcx, %r11, %rbx -; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shlxq %rcx, %r8, %r14 -; FALLBACK2-NEXT: shrq %r8 -; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK2-NEXT: orq %rbx, %r8 -; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r14, %rdi -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rbx -; FALLBACK2-NEXT: shlxq %rcx, %rbx, %r14 -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r15 -; FALLBACK2-NEXT: shlxq %rcx, %r15, %r12 -; FALLBACK2-NEXT: shrq %r15 -; FALLBACK2-NEXT: shrxq %rax, %r15, %r15 -; FALLBACK2-NEXT: orq %r14, %r15 -; FALLBACK2-NEXT: shrq %r11 -; FALLBACK2-NEXT: shrxq %rax, %r11, %r11 -; FALLBACK2-NEXT: orq %r12, %r11 -; FALLBACK2-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 -; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r14, %rsi -; FALLBACK2-NEXT: shrq %rbx -; FALLBACK2-NEXT: shrxq %rax, %rbx, %rax -; FALLBACK2-NEXT: orq %rcx, %rax -; FALLBACK2-NEXT: movq %r10, (%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %rsi, 56(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r15, 40(%rdx) -; FALLBACK2-NEXT: movq %rdi, 16(%rdx) -; FALLBACK2-NEXT: movq %r8, 24(%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) -; FALLBACK2-NEXT: addq $8, %rsp -; FALLBACK2-NEXT: popq %rbx -; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r14 -; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: shl_64bytes: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: pushq %r14 -; FALLBACK3-NEXT: pushq %rbx -; FALLBACK3-NEXT: pushq %rax -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %rcx -; FALLBACK3-NEXT: movq 16(%rdi), %r8 -; FALLBACK3-NEXT: movq 24(%rdi), %r9 -; FALLBACK3-NEXT: movq 32(%rdi), %r10 -; FALLBACK3-NEXT: movq 40(%rdi), %r11 -; FALLBACK3-NEXT: movq 48(%rdi), %rbx -; FALLBACK3-NEXT: movq 56(%rdi), %rdi -; FALLBACK3-NEXT: movl (%rsi), %esi -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: leal (,%rsi,8), %ecx -; FALLBACK3-NEXT: andl $56, %ecx -; FALLBACK3-NEXT: andl $56, %esi -; FALLBACK3-NEXT: negl %esi -; FALLBACK3-NEXT: movslq %esi, %r8 -; FALLBACK3-NEXT: movq -48(%rsp,%r8), %rax -; FALLBACK3-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK3-NEXT: movq %r9, %rsi -; FALLBACK3-NEXT: shldq %cl, %rax, %rsi -; FALLBACK3-NEXT: movq -64(%rsp,%r8), %r10 -; FALLBACK3-NEXT: movq -56(%rsp,%r8), %rdi -; FALLBACK3-NEXT: shldq %cl, %rdi, %rax -; FALLBACK3-NEXT: movq -32(%rsp,%r8), %r11 -; FALLBACK3-NEXT: movq -24(%rsp,%r8), %rbx -; FALLBACK3-NEXT: movq %rbx, %r14 -; FALLBACK3-NEXT: shldq %cl, %r11, %r14 -; FALLBACK3-NEXT: shldq %cl, %r9, %r11 -; FALLBACK3-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK3-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK3-NEXT: shldq %cl, %r9, %r8 -; FALLBACK3-NEXT: shldq %cl, %rbx, %r9 -; FALLBACK3-NEXT: shldq %cl, %r10, %rdi -; FALLBACK3-NEXT: shlxq %rcx, %r10, %rcx -; FALLBACK3-NEXT: movq %r9, 48(%rdx) -; FALLBACK3-NEXT: movq %r8, 56(%rdx) -; FALLBACK3-NEXT: movq %r11, 32(%rdx) -; FALLBACK3-NEXT: movq %r14, 40(%rdx) -; FALLBACK3-NEXT: movq %rax, 16(%rdx) -; FALLBACK3-NEXT: movq %rsi, 24(%rdx) -; FALLBACK3-NEXT: movq %rcx, (%rdx) -; FALLBACK3-NEXT: movq %rdi, 8(%rdx) -; FALLBACK3-NEXT: addq $8, %rsp -; FALLBACK3-NEXT: popq %rbx -; FALLBACK3-NEXT: popq %r14 -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: shl_64bytes: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq %r15 -; FALLBACK4-NEXT: pushq %r14 -; FALLBACK4-NEXT: pushq %r13 -; FALLBACK4-NEXT: pushq %r12 -; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK4-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK4-NEXT: movl (%rsi), %ecx -; FALLBACK4-NEXT: xorps %xmm4, %xmm4 -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: leal (,%rcx,8), %eax -; FALLBACK4-NEXT: andl $56, %eax -; FALLBACK4-NEXT: andl $56, %ecx -; FALLBACK4-NEXT: negl %ecx -; FALLBACK4-NEXT: movslq %ecx, %r9 -; FALLBACK4-NEXT: movq -24(%rsp,%r9), %rdi -; FALLBACK4-NEXT: movq %rdi, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK4-NEXT: movq %r11, %r8 -; FALLBACK4-NEXT: shrq %r8 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: orq %r10, %r8 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9), %rbx -; FALLBACK4-NEXT: movq %rbx, %r10 -; FALLBACK4-NEXT: shrq %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: orq %r11, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r15 -; FALLBACK4-NEXT: movq %r15, %r11 -; FALLBACK4-NEXT: shrq %r11 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: orq %rbx, %r11 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r15 -; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r14 -; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r12 -; FALLBACK4-NEXT: movq %r12, %rbx -; FALLBACK4-NEXT: shrq %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbx -; FALLBACK4-NEXT: orq %r15, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r12 -; FALLBACK4-NEXT: movq %r14, %r15 -; FALLBACK4-NEXT: shrq %r15 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r15 -; FALLBACK4-NEXT: orq %r12, %r15 -; FALLBACK4-NEXT: movq -16(%rsp,%r9), %r12 -; FALLBACK4-NEXT: movq %r12, %r13 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r13 -; FALLBACK4-NEXT: shrq %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %rdi -; FALLBACK4-NEXT: orq %r13, %rdi -; FALLBACK4-NEXT: movq -8(%rsp,%r9), %r9 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r9 -; FALLBACK4-NEXT: shrq %r12 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r12 -; FALLBACK4-NEXT: orq %r9, %r12 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r14 -; FALLBACK4-NEXT: movq %r14, (%rdx) -; FALLBACK4-NEXT: movq %r12, 56(%rdx) -; FALLBACK4-NEXT: movq %rdi, 48(%rdx) -; FALLBACK4-NEXT: movq %r15, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) -; FALLBACK4-NEXT: movq %r11, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 32(%rdx) -; FALLBACK4-NEXT: movq %r8, 40(%rdx) -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: popq %r12 -; FALLBACK4-NEXT: popq %r13 -; FALLBACK4-NEXT: popq %r14 -; FALLBACK4-NEXT: popq %r15 -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: shl_64bytes: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: pushq %r15 -; FALLBACK5-NEXT: pushq %r14 -; FALLBACK5-NEXT: pushq %rbx -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK5-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK5-NEXT: movl (%rsi), %eax -; FALLBACK5-NEXT: xorps %xmm4, %xmm4 -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: andl $56, %ecx -; FALLBACK5-NEXT: andl $56, %eax -; FALLBACK5-NEXT: negl %eax -; FALLBACK5-NEXT: movslq %eax, %r8 -; FALLBACK5-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK5-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK5-NEXT: movq %r9, %rsi -; FALLBACK5-NEXT: shldq %cl, %rax, %rsi -; FALLBACK5-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK5-NEXT: shldq %cl, %rdi, %rax -; FALLBACK5-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK5-NEXT: shldq %cl, %r10, %rdi -; FALLBACK5-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK5-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK5-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK5-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK5-NEXT: movq %r14, %r15 -; FALLBACK5-NEXT: shldq %cl, %r9, %r15 -; FALLBACK5-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK5-NEXT: shldq %cl, %r14, %r8 -; FALLBACK5-NEXT: movq %r11, %r9 -; FALLBACK5-NEXT: shlq %cl, %r9 -; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK5-NEXT: shldq %cl, %r11, %rbx -; FALLBACK5-NEXT: movq %r8, 56(%rdx) -; FALLBACK5-NEXT: movq %r15, 48(%rdx) -; FALLBACK5-NEXT: movq %rbx, 8(%rdx) -; FALLBACK5-NEXT: movq %r10, 16(%rdx) -; FALLBACK5-NEXT: movq %rdi, 24(%rdx) -; FALLBACK5-NEXT: movq %rax, 32(%rdx) -; FALLBACK5-NEXT: movq %rsi, 40(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) -; FALLBACK5-NEXT: popq %rbx -; FALLBACK5-NEXT: popq %r14 -; FALLBACK5-NEXT: popq %r15 -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: shl_64bytes: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %r15 -; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r12 -; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK6-NEXT: movl (%rsi), %esi -; FALLBACK6-NEXT: xorps %xmm4, %xmm4 -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rsi,8), %eax -; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: movl %eax, %ecx -; FALLBACK6-NEXT: andl $56, %esi -; FALLBACK6-NEXT: negl %esi -; FALLBACK6-NEXT: movslq %esi, %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r9 -; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r8 -; FALLBACK6-NEXT: shlxq %rcx, %r8, %r10 -; FALLBACK6-NEXT: shrq %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK6-NEXT: orq %r9, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r9 -; FALLBACK6-NEXT: shlxq %rcx, %r9, %r11 -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %r9 -; FALLBACK6-NEXT: orq %r10, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %r10 -; FALLBACK6-NEXT: shlxq %rcx, %r10, %r14 -; FALLBACK6-NEXT: shrq %r10 -; FALLBACK6-NEXT: shrxq %rax, %r10, %r10 -; FALLBACK6-NEXT: orq %r11, %r10 -; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %rbx -; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r11 -; FALLBACK6-NEXT: shlxq %rcx, %r11, %r15 -; FALLBACK6-NEXT: shrq %r11 -; FALLBACK6-NEXT: shrxq %rax, %r11, %r11 -; FALLBACK6-NEXT: orq %r14, %r11 -; FALLBACK6-NEXT: shlxq %rcx, %rbx, %r14 -; FALLBACK6-NEXT: shrq %rbx -; FALLBACK6-NEXT: shrxq %rax, %rbx, %rbx -; FALLBACK6-NEXT: orq %r15, %rbx -; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %r15 -; FALLBACK6-NEXT: shlxq %rcx, %r15, %r12 -; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %r12, %rdi -; FALLBACK6-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx -; FALLBACK6-NEXT: shrq %r15 -; FALLBACK6-NEXT: shrxq %rax, %r15, %rax -; FALLBACK6-NEXT: orq %rcx, %rax -; FALLBACK6-NEXT: movq %r14, (%rdx) -; FALLBACK6-NEXT: movq %rax, 56(%rdx) -; FALLBACK6-NEXT: movq %rdi, 48(%rdx) -; FALLBACK6-NEXT: movq %rbx, 8(%rdx) -; FALLBACK6-NEXT: movq %r11, 16(%rdx) -; FALLBACK6-NEXT: movq %r10, 24(%rdx) -; FALLBACK6-NEXT: movq %r9, 32(%rdx) -; FALLBACK6-NEXT: movq %r8, 40(%rdx) -; FALLBACK6-NEXT: addq $8, %rsp -; FALLBACK6-NEXT: popq %rbx -; FALLBACK6-NEXT: popq %r12 -; FALLBACK6-NEXT: popq %r14 -; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: shl_64bytes: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: pushq %r15 -; FALLBACK7-NEXT: pushq %r14 -; FALLBACK7-NEXT: pushq %rbx -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK7-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK7-NEXT: movl (%rsi), %eax -; FALLBACK7-NEXT: xorps %xmm4, %xmm4 -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: leal (,%rax,8), %ecx -; FALLBACK7-NEXT: andl $56, %ecx -; FALLBACK7-NEXT: andl $56, %eax -; FALLBACK7-NEXT: negl %eax -; FALLBACK7-NEXT: movslq %eax, %r8 -; FALLBACK7-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK7-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK7-NEXT: movq %r9, %rsi -; FALLBACK7-NEXT: shldq %cl, %rax, %rsi -; FALLBACK7-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK7-NEXT: shldq %cl, %rdi, %rax -; FALLBACK7-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK7-NEXT: shldq %cl, %r10, %rdi -; FALLBACK7-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK7-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK7-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK7-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK7-NEXT: movq %r14, %r15 -; FALLBACK7-NEXT: shldq %cl, %r9, %r15 -; FALLBACK7-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK7-NEXT: shldq %cl, %r14, %r8 -; FALLBACK7-NEXT: shlxq %rcx, %r11, %r9 -; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shldq %cl, %r11, %rbx -; FALLBACK7-NEXT: movq %r8, 56(%rdx) -; FALLBACK7-NEXT: movq %r15, 48(%rdx) -; FALLBACK7-NEXT: movq %rbx, 8(%rdx) -; FALLBACK7-NEXT: movq %r10, 16(%rdx) -; FALLBACK7-NEXT: movq %rdi, 24(%rdx) -; FALLBACK7-NEXT: movq %rax, 32(%rdx) -; FALLBACK7-NEXT: movq %rsi, 40(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) -; FALLBACK7-NEXT: popq %rbx -; FALLBACK7-NEXT: popq %r14 -; FALLBACK7-NEXT: popq %r15 -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: shl_64bytes: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: pushq %r15 -; FALLBACK8-NEXT: pushq %r14 -; FALLBACK8-NEXT: pushq %r13 -; FALLBACK8-NEXT: pushq %r12 -; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK8-NEXT: movl (%rsi), %ecx -; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: leal (,%rcx,8), %eax -; FALLBACK8-NEXT: andl $56, %eax -; FALLBACK8-NEXT: andl $56, %ecx -; FALLBACK8-NEXT: negl %ecx -; FALLBACK8-NEXT: movslq %ecx, %r9 -; FALLBACK8-NEXT: movq -24(%rsp,%r9), %rdi -; FALLBACK8-NEXT: movq %rdi, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK8-NEXT: movq %r11, %r8 -; FALLBACK8-NEXT: shrq %r8 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: orq %r10, %r8 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9), %rbx -; FALLBACK8-NEXT: movq %rbx, %r10 -; FALLBACK8-NEXT: shrq %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: orq %r11, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r15 -; FALLBACK8-NEXT: movq %r15, %r11 -; FALLBACK8-NEXT: shrq %r11 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: orq %rbx, %r11 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r15 -; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r14 -; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r12 -; FALLBACK8-NEXT: movq %r12, %rbx -; FALLBACK8-NEXT: shrq %rbx -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbx -; FALLBACK8-NEXT: orq %r15, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r12 -; FALLBACK8-NEXT: movq %r14, %r15 -; FALLBACK8-NEXT: shrq %r15 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r15 -; FALLBACK8-NEXT: orq %r12, %r15 -; FALLBACK8-NEXT: movq -16(%rsp,%r9), %r12 -; FALLBACK8-NEXT: movq %r12, %r13 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r13 -; FALLBACK8-NEXT: shrq %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %rdi -; FALLBACK8-NEXT: orq %r13, %rdi -; FALLBACK8-NEXT: movq -8(%rsp,%r9), %r9 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r9 -; FALLBACK8-NEXT: shrq %r12 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r12 -; FALLBACK8-NEXT: orq %r9, %r12 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r14 -; FALLBACK8-NEXT: movq %r14, (%rdx) -; FALLBACK8-NEXT: movq %r12, 56(%rdx) -; FALLBACK8-NEXT: movq %rdi, 48(%rdx) -; FALLBACK8-NEXT: movq %r15, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) -; FALLBACK8-NEXT: movq %r11, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 32(%rdx) -; FALLBACK8-NEXT: movq %r8, 40(%rdx) -; FALLBACK8-NEXT: popq %rbx -; FALLBACK8-NEXT: popq %r12 -; FALLBACK8-NEXT: popq %r13 -; FALLBACK8-NEXT: popq %r14 -; FALLBACK8-NEXT: popq %r15 -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: shl_64bytes: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: pushq %r15 -; FALLBACK9-NEXT: pushq %r14 -; FALLBACK9-NEXT: pushq %rbx -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax -; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx -; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: negl %eax -; FALLBACK9-NEXT: movslq %eax, %r8 -; FALLBACK9-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK9-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shldq %cl, %rax, %rsi -; FALLBACK9-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK9-NEXT: shldq %cl, %rdi, %rax -; FALLBACK9-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK9-NEXT: shldq %cl, %r10, %rdi -; FALLBACK9-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK9-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK9-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK9-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK9-NEXT: movq %r14, %r15 -; FALLBACK9-NEXT: shldq %cl, %r9, %r15 -; FALLBACK9-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK9-NEXT: shldq %cl, %r14, %r8 -; FALLBACK9-NEXT: movq %r11, %r9 -; FALLBACK9-NEXT: shlq %cl, %r9 -; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK9-NEXT: shldq %cl, %r11, %rbx -; FALLBACK9-NEXT: movq %r8, 56(%rdx) -; FALLBACK9-NEXT: movq %r15, 48(%rdx) -; FALLBACK9-NEXT: movq %rbx, 8(%rdx) -; FALLBACK9-NEXT: movq %r10, 16(%rdx) -; FALLBACK9-NEXT: movq %rdi, 24(%rdx) -; FALLBACK9-NEXT: movq %rax, 32(%rdx) -; FALLBACK9-NEXT: movq %rsi, 40(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) -; FALLBACK9-NEXT: popq %rbx -; FALLBACK9-NEXT: popq %r14 -; FALLBACK9-NEXT: popq %r15 -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: shl_64bytes: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %r15 -; FALLBACK10-NEXT: pushq %r14 -; FALLBACK10-NEXT: pushq %r12 -; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %esi -; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rsi,8), %eax -; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: movl %eax, %ecx -; FALLBACK10-NEXT: andl $56, %esi -; FALLBACK10-NEXT: negl %esi -; FALLBACK10-NEXT: movslq %esi, %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r9 -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r8 -; FALLBACK10-NEXT: shlxq %rcx, %r8, %r10 -; FALLBACK10-NEXT: shrq %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK10-NEXT: orq %r9, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r9 -; FALLBACK10-NEXT: shlxq %rcx, %r9, %r11 -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %r9 -; FALLBACK10-NEXT: orq %r10, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %r10 -; FALLBACK10-NEXT: shlxq %rcx, %r10, %r14 -; FALLBACK10-NEXT: shrq %r10 -; FALLBACK10-NEXT: shrxq %rax, %r10, %r10 -; FALLBACK10-NEXT: orq %r11, %r10 -; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %rbx -; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r11 -; FALLBACK10-NEXT: shlxq %rcx, %r11, %r15 -; FALLBACK10-NEXT: shrq %r11 -; FALLBACK10-NEXT: shrxq %rax, %r11, %r11 -; FALLBACK10-NEXT: orq %r14, %r11 -; FALLBACK10-NEXT: shlxq %rcx, %rbx, %r14 -; FALLBACK10-NEXT: shrq %rbx -; FALLBACK10-NEXT: shrxq %rax, %rbx, %rbx -; FALLBACK10-NEXT: orq %r15, %rbx -; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %r15 -; FALLBACK10-NEXT: shlxq %rcx, %r15, %r12 -; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %r12, %rdi -; FALLBACK10-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx -; FALLBACK10-NEXT: shrq %r15 -; FALLBACK10-NEXT: shrxq %rax, %r15, %rax -; FALLBACK10-NEXT: orq %rcx, %rax -; FALLBACK10-NEXT: movq %r14, (%rdx) -; FALLBACK10-NEXT: movq %rax, 56(%rdx) -; FALLBACK10-NEXT: movq %rdi, 48(%rdx) -; FALLBACK10-NEXT: movq %rbx, 8(%rdx) -; FALLBACK10-NEXT: movq %r11, 16(%rdx) -; FALLBACK10-NEXT: movq %r10, 24(%rdx) -; FALLBACK10-NEXT: movq %r9, 32(%rdx) -; FALLBACK10-NEXT: movq %r8, 40(%rdx) -; FALLBACK10-NEXT: addq $8, %rsp -; FALLBACK10-NEXT: popq %rbx -; FALLBACK10-NEXT: popq %r12 -; FALLBACK10-NEXT: popq %r14 -; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: shl_64bytes: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: pushq %r15 -; FALLBACK11-NEXT: pushq %r14 -; FALLBACK11-NEXT: pushq %rbx -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK11-NEXT: movl (%rsi), %eax -; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: leal (,%rax,8), %ecx -; FALLBACK11-NEXT: andl $56, %ecx -; FALLBACK11-NEXT: andl $56, %eax -; FALLBACK11-NEXT: negl %eax -; FALLBACK11-NEXT: movslq %eax, %r8 -; FALLBACK11-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK11-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK11-NEXT: movq %r9, %rsi -; FALLBACK11-NEXT: shldq %cl, %rax, %rsi -; FALLBACK11-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK11-NEXT: shldq %cl, %rdi, %rax -; FALLBACK11-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK11-NEXT: shldq %cl, %r10, %rdi -; FALLBACK11-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK11-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK11-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK11-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK11-NEXT: movq %r14, %r15 -; FALLBACK11-NEXT: shldq %cl, %r9, %r15 -; FALLBACK11-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK11-NEXT: shldq %cl, %r14, %r8 -; FALLBACK11-NEXT: shlxq %rcx, %r11, %r9 -; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shldq %cl, %r11, %rbx -; FALLBACK11-NEXT: movq %r8, 56(%rdx) -; FALLBACK11-NEXT: movq %r15, 48(%rdx) -; FALLBACK11-NEXT: movq %rbx, 8(%rdx) -; FALLBACK11-NEXT: movq %r10, 16(%rdx) -; FALLBACK11-NEXT: movq %rdi, 24(%rdx) -; FALLBACK11-NEXT: movq %rax, 32(%rdx) -; FALLBACK11-NEXT: movq %rsi, 40(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) -; FALLBACK11-NEXT: popq %rbx -; FALLBACK11-NEXT: popq %r14 -; FALLBACK11-NEXT: popq %r15 -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: shl_64bytes: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: pushq %r15 -; FALLBACK12-NEXT: pushq %r14 -; FALLBACK12-NEXT: pushq %r13 -; FALLBACK12-NEXT: pushq %r12 -; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK12-NEXT: movl (%rsi), %ecx -; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%rcx,8), %eax -; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %ecx -; FALLBACK12-NEXT: negl %ecx -; FALLBACK12-NEXT: movslq %ecx, %r9 -; FALLBACK12-NEXT: movq -24(%rsp,%r9), %rdi -; FALLBACK12-NEXT: movq %rdi, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK12-NEXT: movq %r11, %r8 -; FALLBACK12-NEXT: shrq %r8 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: orq %r10, %r8 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9), %rbx -; FALLBACK12-NEXT: movq %rbx, %r10 -; FALLBACK12-NEXT: shrq %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: orq %r11, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r15 -; FALLBACK12-NEXT: movq %r15, %r11 -; FALLBACK12-NEXT: shrq %r11 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r15 -; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r14 -; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r12 -; FALLBACK12-NEXT: movq %r12, %rbx -; FALLBACK12-NEXT: shrq %rbx -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: orq %r15, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r12 -; FALLBACK12-NEXT: movq %r14, %r15 -; FALLBACK12-NEXT: shrq %r15 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r15 -; FALLBACK12-NEXT: orq %r12, %r15 -; FALLBACK12-NEXT: movq -16(%rsp,%r9), %r12 -; FALLBACK12-NEXT: movq %r12, %r13 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r13 -; FALLBACK12-NEXT: shrq %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %rdi -; FALLBACK12-NEXT: orq %r13, %rdi -; FALLBACK12-NEXT: movq -8(%rsp,%r9), %r9 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r9 -; FALLBACK12-NEXT: shrq %r12 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r12 -; FALLBACK12-NEXT: orq %r9, %r12 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r14 -; FALLBACK12-NEXT: movq %r14, (%rdx) -; FALLBACK12-NEXT: movq %r12, 56(%rdx) -; FALLBACK12-NEXT: movq %rdi, 48(%rdx) -; FALLBACK12-NEXT: movq %r15, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) -; FALLBACK12-NEXT: movq %r11, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 32(%rdx) -; FALLBACK12-NEXT: movq %r8, 40(%rdx) -; FALLBACK12-NEXT: popq %rbx -; FALLBACK12-NEXT: popq %r12 -; FALLBACK12-NEXT: popq %r13 -; FALLBACK12-NEXT: popq %r14 -; FALLBACK12-NEXT: popq %r15 -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: shl_64bytes: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: pushq %r15 -; FALLBACK13-NEXT: pushq %r14 -; FALLBACK13-NEXT: pushq %rbx -; FALLBACK13-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK13-NEXT: movl (%rsi), %eax -; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: leal (,%rax,8), %ecx -; FALLBACK13-NEXT: andl $56, %ecx -; FALLBACK13-NEXT: andl $56, %eax -; FALLBACK13-NEXT: negl %eax -; FALLBACK13-NEXT: movslq %eax, %r8 -; FALLBACK13-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK13-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK13-NEXT: movq %r9, %rsi -; FALLBACK13-NEXT: shldq %cl, %rax, %rsi -; FALLBACK13-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK13-NEXT: shldq %cl, %rdi, %rax -; FALLBACK13-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK13-NEXT: shldq %cl, %r10, %rdi -; FALLBACK13-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK13-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK13-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK13-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK13-NEXT: movq %r14, %r15 -; FALLBACK13-NEXT: shldq %cl, %r9, %r15 -; FALLBACK13-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK13-NEXT: shldq %cl, %r14, %r8 -; FALLBACK13-NEXT: movq %r11, %r9 -; FALLBACK13-NEXT: shlq %cl, %r9 -; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: shldq %cl, %r11, %rbx -; FALLBACK13-NEXT: movq %r8, 56(%rdx) -; FALLBACK13-NEXT: movq %r15, 48(%rdx) -; FALLBACK13-NEXT: movq %rbx, 8(%rdx) -; FALLBACK13-NEXT: movq %r10, 16(%rdx) -; FALLBACK13-NEXT: movq %rdi, 24(%rdx) -; FALLBACK13-NEXT: movq %rax, 32(%rdx) -; FALLBACK13-NEXT: movq %rsi, 40(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) -; FALLBACK13-NEXT: popq %rbx -; FALLBACK13-NEXT: popq %r14 -; FALLBACK13-NEXT: popq %r15 -; FALLBACK13-NEXT: vzeroupper -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: shl_64bytes: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %r15 -; FALLBACK14-NEXT: pushq %r14 -; FALLBACK14-NEXT: pushq %r12 -; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax -; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK14-NEXT: movl (%rsi), %esi -; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rsi,8), %eax -; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: movl %eax, %ecx -; FALLBACK14-NEXT: andl $56, %esi -; FALLBACK14-NEXT: negl %esi -; FALLBACK14-NEXT: movslq %esi, %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r9 -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r8 -; FALLBACK14-NEXT: shlxq %rcx, %r8, %r10 -; FALLBACK14-NEXT: shrq %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 -; FALLBACK14-NEXT: orq %r9, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r9 -; FALLBACK14-NEXT: shlxq %rcx, %r9, %r11 -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %r9 -; FALLBACK14-NEXT: orq %r10, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %r10 -; FALLBACK14-NEXT: shlxq %rcx, %r10, %r14 -; FALLBACK14-NEXT: shrq %r10 -; FALLBACK14-NEXT: shrxq %rax, %r10, %r10 -; FALLBACK14-NEXT: orq %r11, %r10 -; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %rbx -; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: shlxq %rcx, %r11, %r15 -; FALLBACK14-NEXT: shrq %r11 -; FALLBACK14-NEXT: shrxq %rax, %r11, %r11 -; FALLBACK14-NEXT: orq %r14, %r11 -; FALLBACK14-NEXT: shlxq %rcx, %rbx, %r14 -; FALLBACK14-NEXT: shrq %rbx -; FALLBACK14-NEXT: shrxq %rax, %rbx, %rbx -; FALLBACK14-NEXT: orq %r15, %rbx -; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %r15 -; FALLBACK14-NEXT: shlxq %rcx, %r15, %r12 -; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %r12, %rdi -; FALLBACK14-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx -; FALLBACK14-NEXT: shrq %r15 -; FALLBACK14-NEXT: shrxq %rax, %r15, %rax -; FALLBACK14-NEXT: orq %rcx, %rax -; FALLBACK14-NEXT: movq %r14, (%rdx) -; FALLBACK14-NEXT: movq %rax, 56(%rdx) -; FALLBACK14-NEXT: movq %rdi, 48(%rdx) -; FALLBACK14-NEXT: movq %rbx, 8(%rdx) -; FALLBACK14-NEXT: movq %r11, 16(%rdx) -; FALLBACK14-NEXT: movq %r10, 24(%rdx) -; FALLBACK14-NEXT: movq %r9, 32(%rdx) -; FALLBACK14-NEXT: movq %r8, 40(%rdx) -; FALLBACK14-NEXT: addq $8, %rsp -; FALLBACK14-NEXT: popq %rbx -; FALLBACK14-NEXT: popq %r12 -; FALLBACK14-NEXT: popq %r14 -; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: shl_64bytes: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: pushq %r15 -; FALLBACK15-NEXT: pushq %r14 -; FALLBACK15-NEXT: pushq %rbx -; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax -; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: negl %eax -; FALLBACK15-NEXT: movslq %eax, %r8 -; FALLBACK15-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK15-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shldq %cl, %rax, %rsi -; FALLBACK15-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK15-NEXT: shldq %cl, %rdi, %rax -; FALLBACK15-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK15-NEXT: shldq %cl, %r10, %rdi -; FALLBACK15-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK15-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK15-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK15-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK15-NEXT: movq %r14, %r15 -; FALLBACK15-NEXT: shldq %cl, %r9, %r15 -; FALLBACK15-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK15-NEXT: shldq %cl, %r14, %r8 -; FALLBACK15-NEXT: shlxq %rcx, %r11, %r9 -; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shldq %cl, %r11, %rbx -; FALLBACK15-NEXT: movq %r8, 56(%rdx) -; FALLBACK15-NEXT: movq %r15, 48(%rdx) -; FALLBACK15-NEXT: movq %rbx, 8(%rdx) -; FALLBACK15-NEXT: movq %r10, 16(%rdx) -; FALLBACK15-NEXT: movq %rdi, 24(%rdx) -; FALLBACK15-NEXT: movq %rax, 32(%rdx) -; FALLBACK15-NEXT: movq %rsi, 40(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) -; FALLBACK15-NEXT: popq %rbx -; FALLBACK15-NEXT: popq %r14 -; FALLBACK15-NEXT: popq %r15 -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq -; -; FALLBACK16-LABEL: shl_64bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $204, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 16(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 20(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 24(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 28(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 40(%eax), %ebp -; FALLBACK16-NEXT: movl 44(%eax), %ebx -; FALLBACK16-NEXT: movl 48(%eax), %edi -; FALLBACK16-NEXT: movl 52(%eax), %esi -; FALLBACK16-NEXT: movl 56(%eax), %edx -; FALLBACK16-NEXT: movl 60(%eax), %ecx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %eax -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %eax, %edx -; FALLBACK16-NEXT: andl $60, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: subl %edx, %ecx -; FALLBACK16-NEXT: movl (%ecx), %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%ecx), %edx -; FALLBACK16-NEXT: movl %ecx, %ebp -; FALLBACK16-NEXT: shll $3, %eax -; FALLBACK16-NEXT: andl $24, %eax -; FALLBACK16-NEXT: movl %edx, %esi -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %al, %ch -; FALLBACK16-NEXT: notb %ch -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %esi, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%ebp), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 8(%ebp), %esi -; FALLBACK16-NEXT: movl %ebp, %edi -; FALLBACK16-NEXT: movl %esi, %ebp -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %esi, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: movl 20(%edi), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 16(%edi), %esi -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %esi, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebp, %edx -; FALLBACK16-NEXT: movl 28(%ebp), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 24(%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %edi -; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %esi, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%edx), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 32(%edx), %esi -; FALLBACK16-NEXT: movl %edx, %ebp -; FALLBACK16-NEXT: movl %esi, %edi -; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %esi, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 44(%ebp), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 40(%ebp), %esi -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %esi, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 52(%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %edi -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: negl %edx -; FALLBACK16-NEXT: movl 176(%esp,%edx), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %edi, %ebp -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK16-NEXT: movl 60(%edi), %edx -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: movl 56(%edi), %ebx -; FALLBACK16-NEXT: movl %ebx, %edi -; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %edx, %edi -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: shrl %esi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: orl %ebx, %esi -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl %edx, (%eax) -; FALLBACK16-NEXT: movl %esi, 56(%eax) -; FALLBACK16-NEXT: movl %edi, 60(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 48(%eax) -; FALLBACK16-NEXT: movl %ebp, 52(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 40(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 44(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 32(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 36(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 24(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 28(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 16(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 20(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 8(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 12(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $204, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: shl_64bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $188, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 12(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 16(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 20(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 24(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 28(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 36(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%ecx), %ebp -; FALLBACK17-NEXT: movl 44(%ecx), %ebx -; FALLBACK17-NEXT: movl 48(%ecx), %edi -; FALLBACK17-NEXT: movl 52(%ecx), %esi -; FALLBACK17-NEXT: movl 56(%ecx), %edx -; FALLBACK17-NEXT: movl 60(%ecx), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %ecx -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ecx, %ebp -; FALLBACK17-NEXT: andl $60, %ebp -; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: subl %ebp, %eax -; FALLBACK17-NEXT: movl 8(%eax), %esi -; FALLBACK17-NEXT: movl 12(%eax), %edx -; FALLBACK17-NEXT: shll $3, %ecx -; FALLBACK17-NEXT: andl $24, %ecx -; FALLBACK17-NEXT: movl %edx, %edi -; FALLBACK17-NEXT: shldl %cl, %esi, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%eax), %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %edi, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 16(%eax), %edi -; FALLBACK17-NEXT: movl 20(%eax), %esi -; FALLBACK17-NEXT: movl %esi, %ebx -; FALLBACK17-NEXT: shldl %cl, %edi, %ebx -; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %edx, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 24(%eax), %edi -; FALLBACK17-NEXT: movl 28(%eax), %edx -; FALLBACK17-NEXT: movl %edx, %ebx -; FALLBACK17-NEXT: shldl %cl, %edi, %ebx -; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %esi, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%eax), %edi -; FALLBACK17-NEXT: movl 36(%eax), %esi -; FALLBACK17-NEXT: movl %esi, %ebx -; FALLBACK17-NEXT: shldl %cl, %edi, %ebx -; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %edx, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%eax), %edx -; FALLBACK17-NEXT: movl 44(%eax), %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %edx, %edi -; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %esi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 56(%eax), %edx -; FALLBACK17-NEXT: movl 60(%eax), %edi -; FALLBACK17-NEXT: shldl %cl, %edx, %edi -; FALLBACK17-NEXT: movl (%eax), %ebx -; FALLBACK17-NEXT: movl 52(%eax), %esi -; FALLBACK17-NEXT: shldl %cl, %esi, %edx -; FALLBACK17-NEXT: negl %ebp -; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 56(%ebp) -; FALLBACK17-NEXT: movl %edi, 60(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: shldl %cl, %ebx, %edx -; FALLBACK17-NEXT: shll %cl, %ebx -; FALLBACK17-NEXT: shldl %cl, %eax, %esi -; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK17-NEXT: shldl %cl, %edi, %eax -; FALLBACK17-NEXT: movl %eax, 48(%ebp) -; FALLBACK17-NEXT: movl %esi, 52(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 40(%ebp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 44(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 32(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 36(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 24(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 16(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %ebx, (%ebp) -; FALLBACK17-NEXT: movl %edx, 4(%ebp) -; FALLBACK17-NEXT: addl $188, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: popl %ebp -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: shl_64bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $204, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 12(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 20(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 28(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 36(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebx -; FALLBACK18-NEXT: movl 44(%eax), %edi -; FALLBACK18-NEXT: movl 48(%eax), %esi -; FALLBACK18-NEXT: movl 52(%eax), %edx -; FALLBACK18-NEXT: movl 56(%eax), %ecx -; FALLBACK18-NEXT: movl 60(%eax), %eax -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK18-NEXT: movl (%ebp), %ebp -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: leal (,%ebp,8), %ebx -; FALLBACK18-NEXT: andl $24, %ebx -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: andl $60, %ebp -; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edx -; FALLBACK18-NEXT: subl %ebp, %edx -; FALLBACK18-NEXT: movl (%edx), %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%edx), %ecx -; FALLBACK18-NEXT: notb %bl -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK18-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK18-NEXT: movl %eax, %ebp -; FALLBACK18-NEXT: orl %esi, %edi -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%edx), %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 12(%edx), %esi -; FALLBACK18-NEXT: movl %ebp, %edi -; FALLBACK18-NEXT: shlxl %ebp, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%edx), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 20(%edx), %ecx -; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%edx), %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 28(%edx), %esi -; FALLBACK18-NEXT: shlxl %edi, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%edx), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 36(%edx), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %edi, %eax -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %ebp, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%edx), %edi -; FALLBACK18-NEXT: movl %edi, %esi -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %ecx -; FALLBACK18-NEXT: movl 44(%edx), %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %eax, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %eax, %edi, %edi -; FALLBACK18-NEXT: movl %eax, %esi -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%edx), %ebp -; FALLBACK18-NEXT: movl %ebp, %edi -; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 52(%edx), %ecx -; FALLBACK18-NEXT: shlxl %esi, %ecx, %edi -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %esi, %ebp, %edi -; FALLBACK18-NEXT: movl %esi, %ebp -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %edi, %esi -; FALLBACK18-NEXT: movl 56(%edx), %edi -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: shlxl %ebp, %edi, %ecx -; FALLBACK18-NEXT: orl %ecx, %eax -; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ecx -; FALLBACK18-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK18-NEXT: negl %ebx -; FALLBACK18-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx -; FALLBACK18-NEXT: orl %ecx, %ebx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK18-NEXT: movl %edi, (%edx) -; FALLBACK18-NEXT: movl %eax, 56(%edx) -; FALLBACK18-NEXT: movl %ebx, 60(%edx) -; FALLBACK18-NEXT: movl %esi, 48(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 52(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 40(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 44(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 32(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 36(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 24(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 28(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 16(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 20(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 8(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 12(%edx) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 4(%edx) -; FALLBACK18-NEXT: addl $204, %esp -; FALLBACK18-NEXT: popl %esi -; FALLBACK18-NEXT: popl %edi -; FALLBACK18-NEXT: popl %ebx -; FALLBACK18-NEXT: popl %ebp -; FALLBACK18-NEXT: retl -; -; FALLBACK19-LABEL: shl_64bytes: -; FALLBACK19: # %bb.0: -; FALLBACK19-NEXT: pushl %ebp -; FALLBACK19-NEXT: pushl %ebx -; FALLBACK19-NEXT: pushl %edi -; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $204, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl (%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 12(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 16(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 20(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 24(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 28(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 36(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%ebp), %ebx -; FALLBACK19-NEXT: movl 44(%ebp), %edi -; FALLBACK19-NEXT: movl 48(%ebp), %esi -; FALLBACK19-NEXT: movl 52(%ebp), %edx -; FALLBACK19-NEXT: movl 56(%ebp), %ecx -; FALLBACK19-NEXT: movl 60(%ebp), %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl (%ebp), %ebp -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: leal (,%ebp,8), %ecx -; FALLBACK19-NEXT: andl $24, %ecx -; FALLBACK19-NEXT: andl $60, %ebp -; FALLBACK19-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK19-NEXT: subl %ebp, %eax -; FALLBACK19-NEXT: movl 4(%eax), %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%eax), %edi -; FALLBACK19-NEXT: movl 12(%eax), %edx -; FALLBACK19-NEXT: movl %edx, %ebx -; FALLBACK19-NEXT: shldl %cl, %edi, %ebx -; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shldl %cl, %esi, %edi -; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 16(%eax), %edi -; FALLBACK19-NEXT: movl 20(%eax), %esi -; FALLBACK19-NEXT: movl %esi, %ebx -; FALLBACK19-NEXT: shldl %cl, %edi, %ebx -; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shldl %cl, %edx, %edi -; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 24(%eax), %edi -; FALLBACK19-NEXT: movl 28(%eax), %edx -; FALLBACK19-NEXT: movl %edx, %ebx -; FALLBACK19-NEXT: shldl %cl, %edi, %ebx -; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shldl %cl, %esi, %edi -; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%eax), %edi -; FALLBACK19-NEXT: movl 36(%eax), %esi -; FALLBACK19-NEXT: movl %esi, %ebx -; FALLBACK19-NEXT: shldl %cl, %edi, %ebx -; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shldl %cl, %edx, %edi -; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%eax), %ebx -; FALLBACK19-NEXT: movl 44(%eax), %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shldl %cl, %ebx, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shldl %cl, %esi, %ebx -; FALLBACK19-NEXT: movl 56(%eax), %edx -; FALLBACK19-NEXT: movl 60(%eax), %edi -; FALLBACK19-NEXT: shldl %cl, %edx, %edi -; FALLBACK19-NEXT: movl (%eax), %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 52(%eax), %esi -; FALLBACK19-NEXT: shldl %cl, %esi, %edx -; FALLBACK19-NEXT: negl %ebp -; FALLBACK19-NEXT: movl 176(%esp,%ebp), %ebp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK19-NEXT: movl %edx, 56(%eax) -; FALLBACK19-NEXT: movl %edi, 60(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK19-NEXT: shldl %cl, %edx, %edi -; FALLBACK19-NEXT: shldl %cl, %ebp, %esi -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: shldl %cl, %edx, %ebp -; FALLBACK19-NEXT: movl %ebp, 48(%eax) -; FALLBACK19-NEXT: movl %esi, 52(%eax) -; FALLBACK19-NEXT: movl %ebx, 40(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 44(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 32(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 36(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 24(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 28(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 16(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 20(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 8(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 12(%eax) -; FALLBACK19-NEXT: movl %edi, 4(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, (%eax) -; FALLBACK19-NEXT: addl $204, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: shl_64bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $204, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK20-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK20-NEXT: movl (%eax), %eax -; FALLBACK20-NEXT: xorps %xmm4, %xmm4 -; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: andl $60, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: subl %edx, %ecx -; FALLBACK20-NEXT: movl (%ecx), %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 4(%ecx), %edx -; FALLBACK20-NEXT: movl %ecx, %ebp -; FALLBACK20-NEXT: shll $3, %eax -; FALLBACK20-NEXT: andl $24, %eax -; FALLBACK20-NEXT: movl %edx, %esi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %al, %ch -; FALLBACK20-NEXT: notb %ch -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %esi, %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 12(%ebp), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 8(%ebp), %esi -; FALLBACK20-NEXT: movl %ebp, %edi -; FALLBACK20-NEXT: movl %esi, %ebp -; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %esi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %edi, %ebp -; FALLBACK20-NEXT: movl 20(%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 16(%edi), %esi -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %esi, %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %ebp, %edx -; FALLBACK20-NEXT: movl 28(%ebp), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 24(%ebp), %esi -; FALLBACK20-NEXT: movl %esi, %edi -; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %esi, %ebp -; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 36(%edx), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 32(%edx), %esi -; FALLBACK20-NEXT: movl %edx, %ebp -; FALLBACK20-NEXT: movl %esi, %edi -; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %esi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 44(%ebp), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 40(%ebp), %esi -; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %esi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 52(%ebp), %esi -; FALLBACK20-NEXT: movl %esi, %edi -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: negl %edx -; FALLBACK20-NEXT: movl 176(%esp,%edx), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %edi, %ebp -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK20-NEXT: movl 60(%edi), %edx -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: movl 56(%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, %edi -; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %edx, %edi -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: shrl %esi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl %edx, (%eax) -; FALLBACK20-NEXT: movl %esi, 56(%eax) -; FALLBACK20-NEXT: movl %edi, 60(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 48(%eax) -; FALLBACK20-NEXT: movl %ebp, 52(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 40(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 44(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 32(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 36(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 24(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 28(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 16(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 20(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 8(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 12(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 4(%eax) -; FALLBACK20-NEXT: addl $204, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: shl_64bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $188, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movups (%ecx), %xmm0 -; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK21-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK21-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK21-NEXT: movl (%eax), %ecx -; FALLBACK21-NEXT: xorps %xmm4, %xmm4 -; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %ecx, %ebp -; FALLBACK21-NEXT: andl $60, %ebp -; FALLBACK21-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: subl %ebp, %eax -; FALLBACK21-NEXT: movl 8(%eax), %esi -; FALLBACK21-NEXT: movl 12(%eax), %edx -; FALLBACK21-NEXT: shll $3, %ecx -; FALLBACK21-NEXT: andl $24, %ecx -; FALLBACK21-NEXT: movl %edx, %edi -; FALLBACK21-NEXT: shldl %cl, %esi, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 4(%eax), %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shldl %cl, %edi, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 16(%eax), %edi -; FALLBACK21-NEXT: movl 20(%eax), %esi -; FALLBACK21-NEXT: movl %esi, %ebx -; FALLBACK21-NEXT: shldl %cl, %edi, %ebx -; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shldl %cl, %edx, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 24(%eax), %edi -; FALLBACK21-NEXT: movl 28(%eax), %edx -; FALLBACK21-NEXT: movl %edx, %ebx -; FALLBACK21-NEXT: shldl %cl, %edi, %ebx -; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shldl %cl, %esi, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 32(%eax), %edi -; FALLBACK21-NEXT: movl 36(%eax), %esi -; FALLBACK21-NEXT: movl %esi, %ebx -; FALLBACK21-NEXT: shldl %cl, %edi, %ebx -; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shldl %cl, %edx, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 40(%eax), %edx -; FALLBACK21-NEXT: movl 44(%eax), %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shldl %cl, %edx, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shldl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 56(%eax), %edx -; FALLBACK21-NEXT: movl 60(%eax), %edi -; FALLBACK21-NEXT: shldl %cl, %edx, %edi -; FALLBACK21-NEXT: movl (%eax), %ebx -; FALLBACK21-NEXT: movl 52(%eax), %esi -; FALLBACK21-NEXT: shldl %cl, %esi, %edx -; FALLBACK21-NEXT: negl %ebp -; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %edx, 56(%ebp) -; FALLBACK21-NEXT: movl %edi, 60(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK21-NEXT: shldl %cl, %ebx, %edx -; FALLBACK21-NEXT: shll %cl, %ebx -; FALLBACK21-NEXT: shldl %cl, %eax, %esi -; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK21-NEXT: shldl %cl, %edi, %eax -; FALLBACK21-NEXT: movl %eax, 48(%ebp) -; FALLBACK21-NEXT: movl %esi, 52(%ebp) -; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 40(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 44(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 32(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 36(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 24(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 28(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %ebx, (%ebp) -; FALLBACK21-NEXT: movl %edx, 4(%ebp) -; FALLBACK21-NEXT: addl $188, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: shl_64bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $204, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK22-NEXT: movl (%eax), %eax -; FALLBACK22-NEXT: xorps %xmm4, %xmm4 -; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%eax,8), %ebx -; FALLBACK22-NEXT: andl $24, %ebx -; FALLBACK22-NEXT: movl %ebx, %ecx -; FALLBACK22-NEXT: andl $60, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: subl %eax, %edx -; FALLBACK22-NEXT: movl (%edx), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 4(%edx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK22-NEXT: shlxl %ecx, %eax, %esi -; FALLBACK22-NEXT: orl %esi, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 8(%edx), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK22-NEXT: movl 12(%edx), %esi -; FALLBACK22-NEXT: shlxl %ecx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %edi -; FALLBACK22-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %eax, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 16(%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 20(%edx), %ecx -; FALLBACK22-NEXT: shlxl %edi, %ecx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 24(%edx), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK22-NEXT: movl 28(%edx), %esi -; FALLBACK22-NEXT: shlxl %edi, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 32(%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 36(%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edi, %ecx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, %eax -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ebp, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 40(%edx), %edi -; FALLBACK22-NEXT: movl %edi, %esi -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %ecx -; FALLBACK22-NEXT: movl 44(%edx), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %edi, %edi -; FALLBACK22-NEXT: movl %eax, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %edi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%edx), %ebp -; FALLBACK22-NEXT: movl %ebp, %edi -; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 52(%edx), %ecx -; FALLBACK22-NEXT: shlxl %esi, %ecx, %edi -; FALLBACK22-NEXT: orl %edi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %esi, %ebp, %edi -; FALLBACK22-NEXT: movl %esi, %ebp -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl 56(%edx), %edi -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: shlxl %ebp, %edi, %ecx -; FALLBACK22-NEXT: orl %ecx, %eax -; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ecx -; FALLBACK22-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK22-NEXT: negl %ebx -; FALLBACK22-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx -; FALLBACK22-NEXT: orl %ecx, %ebx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %edi, (%edx) -; FALLBACK22-NEXT: movl %eax, 56(%edx) -; FALLBACK22-NEXT: movl %ebx, 60(%edx) -; FALLBACK22-NEXT: movl %esi, 48(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 52(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 40(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 44(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 32(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 36(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 24(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 16(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 8(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 4(%edx) -; FALLBACK22-NEXT: addl $204, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: shl_64bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $204, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movups (%ecx), %xmm0 -; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK23-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK23-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK23-NEXT: movl (%eax), %ebp -; FALLBACK23-NEXT: xorps %xmm4, %xmm4 -; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: leal (,%ebp,8), %ecx -; FALLBACK23-NEXT: andl $24, %ecx -; FALLBACK23-NEXT: andl $60, %ebp -; FALLBACK23-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: subl %ebp, %eax -; FALLBACK23-NEXT: movl 4(%eax), %esi -; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 8(%eax), %edi -; FALLBACK23-NEXT: movl 12(%eax), %edx -; FALLBACK23-NEXT: movl %edx, %ebx -; FALLBACK23-NEXT: shldl %cl, %edi, %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shldl %cl, %esi, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 16(%eax), %edi -; FALLBACK23-NEXT: movl 20(%eax), %esi -; FALLBACK23-NEXT: movl %esi, %ebx -; FALLBACK23-NEXT: shldl %cl, %edi, %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shldl %cl, %edx, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 24(%eax), %edi -; FALLBACK23-NEXT: movl 28(%eax), %edx -; FALLBACK23-NEXT: movl %edx, %ebx -; FALLBACK23-NEXT: shldl %cl, %edi, %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shldl %cl, %esi, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 32(%eax), %edi -; FALLBACK23-NEXT: movl 36(%eax), %esi -; FALLBACK23-NEXT: movl %esi, %ebx -; FALLBACK23-NEXT: shldl %cl, %edi, %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shldl %cl, %edx, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 40(%eax), %ebx -; FALLBACK23-NEXT: movl 44(%eax), %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shldl %cl, %ebx, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shldl %cl, %esi, %ebx -; FALLBACK23-NEXT: movl 56(%eax), %edx -; FALLBACK23-NEXT: movl 60(%eax), %edi -; FALLBACK23-NEXT: shldl %cl, %edx, %edi -; FALLBACK23-NEXT: movl (%eax), %esi -; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 52(%eax), %esi -; FALLBACK23-NEXT: shldl %cl, %esi, %edx -; FALLBACK23-NEXT: negl %ebp -; FALLBACK23-NEXT: movl 176(%esp,%ebp), %ebp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl %edx, 56(%eax) -; FALLBACK23-NEXT: movl %edi, 60(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK23-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK23-NEXT: shldl %cl, %edx, %edi -; FALLBACK23-NEXT: shldl %cl, %ebp, %esi -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK23-NEXT: shldl %cl, %edx, %ebp -; FALLBACK23-NEXT: movl %ebp, 48(%eax) -; FALLBACK23-NEXT: movl %esi, 52(%eax) -; FALLBACK23-NEXT: movl %ebx, 40(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 44(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 32(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 36(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 24(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 28(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 16(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 20(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 8(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 12(%eax) -; FALLBACK23-NEXT: movl %edi, 4(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, (%eax) -; FALLBACK23-NEXT: addl $204, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: shl_64bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $204, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK24-NEXT: movl (%eax), %eax -; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: andl $60, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: subl %edx, %ecx -; FALLBACK24-NEXT: movl (%ecx), %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 4(%ecx), %edx -; FALLBACK24-NEXT: movl %ecx, %ebp -; FALLBACK24-NEXT: shll $3, %eax -; FALLBACK24-NEXT: andl $24, %eax -; FALLBACK24-NEXT: movl %edx, %esi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %al, %ch -; FALLBACK24-NEXT: notb %ch -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %esi, %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 12(%ebp), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 8(%ebp), %esi -; FALLBACK24-NEXT: movl %ebp, %edi -; FALLBACK24-NEXT: movl %esi, %ebp -; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %esi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %edi, %ebp -; FALLBACK24-NEXT: movl 20(%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 16(%edi), %esi -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %esi, %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %ebp, %edx -; FALLBACK24-NEXT: movl 28(%ebp), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 24(%ebp), %esi -; FALLBACK24-NEXT: movl %esi, %edi -; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %esi, %ebp -; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 36(%edx), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 32(%edx), %esi -; FALLBACK24-NEXT: movl %edx, %ebp -; FALLBACK24-NEXT: movl %esi, %edi -; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %esi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 44(%ebp), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 40(%ebp), %esi -; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %esi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 52(%ebp), %esi -; FALLBACK24-NEXT: movl %esi, %edi -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: negl %edx -; FALLBACK24-NEXT: movl 176(%esp,%edx), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %edi, %ebp -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK24-NEXT: movl 60(%edi), %edx -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: movl 56(%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, %edi -; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %edx, %edi -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: shrl %esi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl %edx, (%eax) -; FALLBACK24-NEXT: movl %esi, 56(%eax) -; FALLBACK24-NEXT: movl %edi, 60(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 48(%eax) -; FALLBACK24-NEXT: movl %ebp, 52(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 40(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 44(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 32(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 36(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 24(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 28(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 16(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 20(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 8(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 12(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 4(%eax) -; FALLBACK24-NEXT: addl $204, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: vzeroupper -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: shl_64bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $188, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK25-NEXT: movl (%eax), %ecx -; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ecx, %ebp -; FALLBACK25-NEXT: andl $60, %ebp -; FALLBACK25-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: subl %ebp, %eax -; FALLBACK25-NEXT: movl 8(%eax), %esi -; FALLBACK25-NEXT: movl 12(%eax), %edx -; FALLBACK25-NEXT: shll $3, %ecx -; FALLBACK25-NEXT: andl $24, %ecx -; FALLBACK25-NEXT: movl %edx, %edi -; FALLBACK25-NEXT: shldl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 4(%eax), %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %edi, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 16(%eax), %edi -; FALLBACK25-NEXT: movl 20(%eax), %esi -; FALLBACK25-NEXT: movl %esi, %ebx -; FALLBACK25-NEXT: shldl %cl, %edi, %ebx -; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %edx, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 24(%eax), %edi -; FALLBACK25-NEXT: movl 28(%eax), %edx -; FALLBACK25-NEXT: movl %edx, %ebx -; FALLBACK25-NEXT: shldl %cl, %edi, %ebx -; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 32(%eax), %edi -; FALLBACK25-NEXT: movl 36(%eax), %esi -; FALLBACK25-NEXT: movl %esi, %ebx -; FALLBACK25-NEXT: shldl %cl, %edi, %ebx -; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %edx, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 40(%eax), %edx -; FALLBACK25-NEXT: movl 44(%eax), %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %edx, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%eax), %edx -; FALLBACK25-NEXT: movl 60(%eax), %edi -; FALLBACK25-NEXT: shldl %cl, %edx, %edi -; FALLBACK25-NEXT: movl (%eax), %ebx -; FALLBACK25-NEXT: movl 52(%eax), %esi -; FALLBACK25-NEXT: shldl %cl, %esi, %edx -; FALLBACK25-NEXT: negl %ebp -; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %edx, 56(%ebp) -; FALLBACK25-NEXT: movl %edi, 60(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK25-NEXT: shldl %cl, %ebx, %edx -; FALLBACK25-NEXT: shll %cl, %ebx -; FALLBACK25-NEXT: shldl %cl, %eax, %esi -; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK25-NEXT: shldl %cl, %edi, %eax -; FALLBACK25-NEXT: movl %eax, 48(%ebp) -; FALLBACK25-NEXT: movl %esi, 52(%ebp) -; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %ebx, (%ebp) -; FALLBACK25-NEXT: movl %edx, 4(%ebp) -; FALLBACK25-NEXT: addl $188, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: vzeroupper -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: shl_64bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $204, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK26-NEXT: movl (%eax), %eax -; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: leal (,%eax,8), %ebx -; FALLBACK26-NEXT: andl $24, %ebx -; FALLBACK26-NEXT: movl %ebx, %ecx -; FALLBACK26-NEXT: andl $60, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: subl %eax, %edx -; FALLBACK26-NEXT: movl (%edx), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 4(%edx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK26-NEXT: shlxl %ecx, %eax, %esi -; FALLBACK26-NEXT: orl %esi, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 8(%edx), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK26-NEXT: movl 12(%edx), %esi -; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %ecx, %edi -; FALLBACK26-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 16(%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 20(%edx), %ecx -; FALLBACK26-NEXT: shlxl %edi, %ecx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 24(%edx), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK26-NEXT: movl 28(%edx), %esi -; FALLBACK26-NEXT: shlxl %edi, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 32(%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 36(%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edi, %ecx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, %eax -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ebp, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 40(%edx), %edi -; FALLBACK26-NEXT: movl %edi, %esi -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %ecx -; FALLBACK26-NEXT: movl 44(%edx), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %edi, %edi -; FALLBACK26-NEXT: movl %eax, %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %edi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%edx), %ebp -; FALLBACK26-NEXT: movl %ebp, %edi -; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %eax -; FALLBACK26-NEXT: movl 52(%edx), %ecx -; FALLBACK26-NEXT: shlxl %esi, %ecx, %edi -; FALLBACK26-NEXT: orl %edi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %esi, %ebp, %edi -; FALLBACK26-NEXT: movl %esi, %ebp -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: movl 56(%edx), %edi -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: shlxl %ebp, %edi, %ecx -; FALLBACK26-NEXT: orl %ecx, %eax -; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ecx -; FALLBACK26-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK26-NEXT: negl %ebx -; FALLBACK26-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx -; FALLBACK26-NEXT: orl %ecx, %ebx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %edi, (%edx) -; FALLBACK26-NEXT: movl %eax, 56(%edx) -; FALLBACK26-NEXT: movl %ebx, 60(%edx) -; FALLBACK26-NEXT: movl %esi, 48(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 52(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 40(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 44(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 32(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 36(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 24(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 16(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 8(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 4(%edx) -; FALLBACK26-NEXT: addl $204, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: vzeroupper -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: shl_64bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $204, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK27-NEXT: movl (%eax), %ebx -; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: leal (,%ebx,8), %ecx -; FALLBACK27-NEXT: andl $24, %ecx -; FALLBACK27-NEXT: andl $60, %ebx -; FALLBACK27-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: subl %ebx, %eax -; FALLBACK27-NEXT: movl 4(%eax), %esi -; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 8(%eax), %edi -; FALLBACK27-NEXT: movl 12(%eax), %edx -; FALLBACK27-NEXT: movl %edx, %ebp -; FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %esi, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 16(%eax), %edi -; FALLBACK27-NEXT: movl 20(%eax), %esi -; FALLBACK27-NEXT: movl %esi, %ebp -; FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %edx, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 24(%eax), %edi -; FALLBACK27-NEXT: movl 28(%eax), %edx -; FALLBACK27-NEXT: movl %edx, %ebp -; FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %esi, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 32(%eax), %edi -; FALLBACK27-NEXT: movl 36(%eax), %esi -; FALLBACK27-NEXT: movl %esi, %ebp -; FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %edx, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 40(%eax), %ebp -; FALLBACK27-NEXT: movl 44(%eax), %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %ebp, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %esi, %ebp -; FALLBACK27-NEXT: movl 56(%eax), %edx -; FALLBACK27-NEXT: movl 60(%eax), %edi -; FALLBACK27-NEXT: shldl %cl, %edx, %edi -; FALLBACK27-NEXT: movl (%eax), %esi -; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 52(%eax), %esi -; FALLBACK27-NEXT: shldl %cl, %esi, %edx -; FALLBACK27-NEXT: negl %ebx -; FALLBACK27-NEXT: movl 176(%esp,%ebx), %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl %edx, 56(%eax) -; FALLBACK27-NEXT: movl %edi, 60(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK27-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK27-NEXT: shldl %cl, %edx, %edi -; FALLBACK27-NEXT: shldl %cl, %ebx, %esi -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK27-NEXT: shldl %cl, %edx, %ebx -; FALLBACK27-NEXT: movl %ebx, 48(%eax) -; FALLBACK27-NEXT: movl %esi, 52(%eax) -; FALLBACK27-NEXT: movl %ebp, 40(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 44(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 32(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 36(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 24(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 28(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 16(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 20(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 8(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 12(%eax) -; FALLBACK27-NEXT: movl %edi, 4(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, (%eax) -; FALLBACK27-NEXT: addl $204, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: vzeroupper -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: shl_64bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $204, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK28-NEXT: movl (%eax), %eax -; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: andl $60, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: subl %edx, %ecx -; FALLBACK28-NEXT: movl (%ecx), %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 4(%ecx), %edx -; FALLBACK28-NEXT: movl %ecx, %ebp -; FALLBACK28-NEXT: shll $3, %eax -; FALLBACK28-NEXT: andl $24, %eax -; FALLBACK28-NEXT: movl %edx, %esi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %al, %ch -; FALLBACK28-NEXT: notb %ch -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %esi, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 12(%ebp), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 8(%ebp), %esi -; FALLBACK28-NEXT: movl %ebp, %edi -; FALLBACK28-NEXT: movl %esi, %ebp -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %edi, %ebp -; FALLBACK28-NEXT: movl 20(%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 16(%edi), %esi -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %esi, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %ebp, %edx -; FALLBACK28-NEXT: movl 28(%ebp), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 24(%ebp), %esi -; FALLBACK28-NEXT: movl %esi, %edi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %esi, %ebp -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 36(%edx), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 32(%edx), %esi -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movl %esi, %edi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 44(%ebp), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 40(%ebp), %esi -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 52(%ebp), %esi -; FALLBACK28-NEXT: movl %esi, %edi -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: negl %edx -; FALLBACK28-NEXT: movl 176(%esp,%edx), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %edi, %ebp -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK28-NEXT: movl 60(%edi), %edx -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: movl 56(%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, %edi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: shrl %esi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl %edx, (%eax) -; FALLBACK28-NEXT: movl %esi, 56(%eax) -; FALLBACK28-NEXT: movl %edi, 60(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 48(%eax) -; FALLBACK28-NEXT: movl %ebp, 52(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 40(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 44(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 32(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 36(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 24(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 28(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 16(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 20(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 8(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 12(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 4(%eax) -; FALLBACK28-NEXT: addl $204, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: vzeroupper -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: shl_64bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $188, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK29-NEXT: movl (%eax), %ecx -; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ecx, %ebp -; FALLBACK29-NEXT: andl $60, %ebp -; FALLBACK29-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: subl %ebp, %eax -; FALLBACK29-NEXT: movl 8(%eax), %esi -; FALLBACK29-NEXT: movl 12(%eax), %edx -; FALLBACK29-NEXT: shll $3, %ecx -; FALLBACK29-NEXT: andl $24, %ecx -; FALLBACK29-NEXT: movl %edx, %edi -; FALLBACK29-NEXT: shldl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 4(%eax), %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %edi, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 16(%eax), %edi -; FALLBACK29-NEXT: movl 20(%eax), %esi -; FALLBACK29-NEXT: movl %esi, %ebx -; FALLBACK29-NEXT: shldl %cl, %edi, %ebx -; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %edx, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 24(%eax), %edi -; FALLBACK29-NEXT: movl 28(%eax), %edx -; FALLBACK29-NEXT: movl %edx, %ebx -; FALLBACK29-NEXT: shldl %cl, %edi, %ebx -; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 32(%eax), %edi -; FALLBACK29-NEXT: movl 36(%eax), %esi -; FALLBACK29-NEXT: movl %esi, %ebx -; FALLBACK29-NEXT: shldl %cl, %edi, %ebx -; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %edx, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 40(%eax), %edx -; FALLBACK29-NEXT: movl 44(%eax), %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %edx, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%eax), %edx -; FALLBACK29-NEXT: movl 60(%eax), %edi -; FALLBACK29-NEXT: shldl %cl, %edx, %edi -; FALLBACK29-NEXT: movl (%eax), %ebx -; FALLBACK29-NEXT: movl 52(%eax), %esi -; FALLBACK29-NEXT: shldl %cl, %esi, %edx -; FALLBACK29-NEXT: negl %ebp -; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %edx, 56(%ebp) -; FALLBACK29-NEXT: movl %edi, 60(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK29-NEXT: shldl %cl, %ebx, %edx -; FALLBACK29-NEXT: shll %cl, %ebx -; FALLBACK29-NEXT: shldl %cl, %eax, %esi -; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK29-NEXT: shldl %cl, %edi, %eax -; FALLBACK29-NEXT: movl %eax, 48(%ebp) -; FALLBACK29-NEXT: movl %esi, 52(%ebp) -; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %ebx, (%ebp) -; FALLBACK29-NEXT: movl %edx, 4(%ebp) -; FALLBACK29-NEXT: addl $188, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: vzeroupper -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: shl_64bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $204, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK30-NEXT: movl (%eax), %eax -; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%eax,8), %ebx -; FALLBACK30-NEXT: andl $24, %ebx -; FALLBACK30-NEXT: movl %ebx, %ecx -; FALLBACK30-NEXT: andl $60, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: subl %eax, %edx -; FALLBACK30-NEXT: movl (%edx), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 4(%edx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK30-NEXT: shlxl %ecx, %eax, %esi -; FALLBACK30-NEXT: orl %esi, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 8(%edx), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 12(%edx), %esi -; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %ecx, %edi -; FALLBACK30-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 16(%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 20(%edx), %ecx -; FALLBACK30-NEXT: shlxl %edi, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 24(%edx), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 28(%edx), %esi -; FALLBACK30-NEXT: shlxl %edi, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 32(%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 36(%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edi, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, %eax -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ebp, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 40(%edx), %edi -; FALLBACK30-NEXT: movl %edi, %esi -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %ecx -; FALLBACK30-NEXT: movl 44(%edx), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %edi, %edi -; FALLBACK30-NEXT: movl %eax, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %edi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%edx), %ebp -; FALLBACK30-NEXT: movl %ebp, %edi -; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 52(%edx), %ecx -; FALLBACK30-NEXT: shlxl %esi, %ecx, %edi -; FALLBACK30-NEXT: orl %edi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %esi, %ebp, %edi -; FALLBACK30-NEXT: movl %esi, %ebp -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: movl 56(%edx), %edi -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: shlxl %ebp, %edi, %ecx -; FALLBACK30-NEXT: orl %ecx, %eax -; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ecx -; FALLBACK30-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK30-NEXT: negl %ebx -; FALLBACK30-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx -; FALLBACK30-NEXT: orl %ecx, %ebx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %edi, (%edx) -; FALLBACK30-NEXT: movl %eax, 56(%edx) -; FALLBACK30-NEXT: movl %ebx, 60(%edx) -; FALLBACK30-NEXT: movl %esi, 48(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 52(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 40(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 44(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 32(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 36(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 24(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 16(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 8(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 4(%edx) -; FALLBACK30-NEXT: addl $204, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: vzeroupper -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: shl_64bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $204, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK31-NEXT: movl (%eax), %ebx -; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: leal (,%ebx,8), %ecx -; FALLBACK31-NEXT: andl $24, %ecx -; FALLBACK31-NEXT: andl $60, %ebx -; FALLBACK31-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: subl %ebx, %eax -; FALLBACK31-NEXT: movl 4(%eax), %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 8(%eax), %edi -; FALLBACK31-NEXT: movl 12(%eax), %edx -; FALLBACK31-NEXT: movl %edx, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %esi, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 16(%eax), %edi -; FALLBACK31-NEXT: movl 20(%eax), %esi -; FALLBACK31-NEXT: movl %esi, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %edx, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 24(%eax), %edi -; FALLBACK31-NEXT: movl 28(%eax), %edx -; FALLBACK31-NEXT: movl %edx, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %esi, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 32(%eax), %edi -; FALLBACK31-NEXT: movl 36(%eax), %esi -; FALLBACK31-NEXT: movl %esi, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %edx, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 40(%eax), %ebp -; FALLBACK31-NEXT: movl 44(%eax), %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %ebp, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %esi, %ebp -; FALLBACK31-NEXT: movl 56(%eax), %edx -; FALLBACK31-NEXT: movl 60(%eax), %edi -; FALLBACK31-NEXT: shldl %cl, %edx, %edi -; FALLBACK31-NEXT: movl (%eax), %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 52(%eax), %esi -; FALLBACK31-NEXT: shldl %cl, %esi, %edx -; FALLBACK31-NEXT: negl %ebx -; FALLBACK31-NEXT: movl 176(%esp,%ebx), %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl %edx, 56(%eax) -; FALLBACK31-NEXT: movl %edi, 60(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK31-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK31-NEXT: shldl %cl, %edx, %edi -; FALLBACK31-NEXT: shldl %cl, %ebx, %esi -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK31-NEXT: shldl %cl, %edx, %ebx -; FALLBACK31-NEXT: movl %ebx, 48(%eax) -; FALLBACK31-NEXT: movl %esi, 52(%eax) -; FALLBACK31-NEXT: movl %ebp, 40(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 44(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 32(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 36(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 24(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 28(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 16(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 20(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 8(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 12(%eax) -; FALLBACK31-NEXT: movl %edi, 4(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, (%eax) -; FALLBACK31-NEXT: addl $204, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: vzeroupper -; FALLBACK31-NEXT: retl +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negl %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movslq %esi, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rbx), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rbx), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rbx), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rbx), %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r15, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rbx), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rbx), %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r13, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r13, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%rbx), %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rbx), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r13, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negl %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movslq %esi, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rdi, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r9), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r9), %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r11, %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r10, %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r9), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r9), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r10, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rbx, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addq $8, %rsp +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negl %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movslq %esi, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r14, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rsi), %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rbx, %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rsi), %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r15, %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r15, %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r14, %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r12, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rsi, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r14, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rbx, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rcx, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negl %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movslq %esi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%r8), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%r8), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%r8), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%r8), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rdi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%r8), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%r8), %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r11, %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r9, %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%r8), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -8(%rsp,%r8), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r9, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rbx, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r10, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negl %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movslq %ecx, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r9), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r9), %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r15, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r9), %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -8(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negl %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movslq %eax, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rdi, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r8), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r10, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r8), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r8), %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rbx, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r9, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -8(%rsp,%r8), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r14, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negl %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movslq %esi, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r10, %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r11, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi), %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rsi), %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r12, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r15, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rcx, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq $8, %rsp +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negl %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movslq %eax, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rdi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%r8), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r10, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%r8), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%r8), %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rbx, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r9, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -8(%rsp,%r8), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r14, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes: +; X64-NO-SHLD-NO-BMI2-AVX1: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rcx,8), %eax +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: negl %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movslq %ecx, %r9 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -24(%rsp,%r9), %rdi +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -32(%rsp,%r9), %r11 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %r8 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r8 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r10, %r8 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -40(%rsp,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r11, %r10 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -48(%rsp,%r9), %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, %r11 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -64(%rsp,%r9), %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -56(%rsp,%r9), %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r15, %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r12, %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -16(%rsp,%r9), %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, %r13 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r13, %rdi +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -8(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r9, %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes: +; X64-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: negl %eax +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movslq %eax, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -32(%rsp,%r8), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -24(%rsp,%r8), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -40(%rsp,%r8), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rdi, %rax +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -48(%rsp,%r8), %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r10, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -64(%rsp,%r8), %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -56(%rsp,%r8), %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rbx, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -16(%rsp,%r8), %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r9, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -8(%rsp,%r8), %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r14, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes: +; X64-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rax +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: negl %esi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movslq %esi, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -24(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rdi, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -32(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r8, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r9, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -40(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r9, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r10, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -48(%rsp,%rsi), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r10, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r11, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -64(%rsp,%rsi), %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -56(%rsp,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rbx, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r15, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -16(%rsp,%rsi), %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r15, %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r12, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r15, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rcx, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq $8, %rsp +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: negl %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movslq %eax, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -32(%rsp,%r8), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -24(%rsp,%r8), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -40(%rsp,%r8), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rdi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -48(%rsp,%r8), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r10, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -64(%rsp,%r8), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -56(%rsp,%r8), %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rbx, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -16(%rsp,%r8), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r9, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -8(%rsp,%r8), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r14, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes: +; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rcx,8), %eax +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movslq %ecx, %r9 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r9), %rdi +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r9), %r11 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r8 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r8 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %r8 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r11, %r10 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r9), %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, %r11 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r9), %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r9), %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r15, %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r9), %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %r13 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %rdi +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r9, %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes: +; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %eax +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movslq %eax, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes: +; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rax +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %esi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %esi, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rdi, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r8, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r9, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r9, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%rsi), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r10, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r11, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%rsi), %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rbx, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%rsi), %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r15, %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r12, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r15, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rcx, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq $8, %rsp +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %eax, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq +; +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%eax), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%edx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: negl %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 176(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%edi), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 56(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ecx), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ecx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%ecx), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ecx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%ecx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl %ebp, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 160(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ebp), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%ebp,8), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %bl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%edx), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 56(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 60(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 48(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 52(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 40(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 44(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 32(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 36(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%ebp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%ebp,8), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 176(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 56(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 60(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %ebp, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 52(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 40(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 4(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %ch +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 12(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 8(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%edx), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: negl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 176(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%edi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 56(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl %ebp, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 8(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 4(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 160(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%eax,8), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl %eax, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 4(%edx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %bl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 8(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 12(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%edx), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %esi, %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %esi, %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebp, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, (%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 56(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 60(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 48(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 52(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 40(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 44(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 32(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 36(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%ebp,8), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 4(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 8(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 176(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 56(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 60(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebp, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 52(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 40(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 32(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes: +; X86-NO-SHLD-NO-BMI2-AVX1: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1 +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%ecx), %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 4(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %ch +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 12(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 8(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 20(%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 16(%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 28(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 24(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 36(%edx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 32(%edx), %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 44(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 40(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: negl %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 176(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%edi), %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 56(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0 +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1 +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl %ebp, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 8(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 4(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 40(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 44(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: negl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 160(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper +; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1 +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%eax,8), %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl %eax, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 4(%edx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %bl +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 8(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 12(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 16(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 20(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 24(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 28(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 32(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 36(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 40(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 44(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %eax, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%edx), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %esi, %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %esi, %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ebp, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: negl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ecx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, (%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 56(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 60(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 48(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 52(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 40(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper +; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%ebx,8), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl %ebx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 4(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 8(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 40(%eax), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 44(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %ebp, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: negl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 176(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 56(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 60(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %ebx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 48(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 52(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, 40(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 32(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper +; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes: +; X86-NO-SHLD-NO-BMI2-AVX512: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0 +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%ecx), %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 4(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %ch +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 12(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 8(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 20(%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 16(%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 28(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 24(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 36(%edx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 32(%edx), %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 44(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 40(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 176(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%edi), %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 56(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper +; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0 +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl %ebp, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 8(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 4(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 40(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 44(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 160(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0 +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%eax,8), %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl %eax, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 4(%edx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %bl +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 8(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 12(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 16(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 20(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 24(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 28(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 32(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 36(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 40(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 44(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 48(%edx), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ecx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, (%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 56(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 60(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 48(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 52(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 40(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%ebx,8), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl %ebx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 4(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 8(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 40(%eax), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 44(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %ebp, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 176(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 56(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 60(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %ebx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 48(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 52(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, 40(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 32(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3 @@ -20136,4087 +17857,3115 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou } define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: ashr_64bytes: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %r15 -; FALLBACK0-NEXT: pushq %r14 -; FALLBACK0-NEXT: pushq %r13 -; FALLBACK0-NEXT: pushq %r12 -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rax -; FALLBACK0-NEXT: movq 8(%rdi), %rcx -; FALLBACK0-NEXT: movq 16(%rdi), %r8 -; FALLBACK0-NEXT: movq 24(%rdi), %r9 -; FALLBACK0-NEXT: movq 32(%rdi), %r10 -; FALLBACK0-NEXT: movq 40(%rdi), %r11 -; FALLBACK0-NEXT: movq 48(%rdi), %rbx -; FALLBACK0-NEXT: movq 56(%rdi), %r14 -; FALLBACK0-NEXT: movl (%rsi), %edi -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: sarq $63, %r14 -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: leal (,%rdi,8), %eax -; FALLBACK0-NEXT: andl $56, %eax -; FALLBACK0-NEXT: andl $56, %edi -; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 -; FALLBACK0-NEXT: movq %r8, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r9 -; FALLBACK0-NEXT: orq %r11, %r9 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %r8, %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r10, %r8 -; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK0-NEXT: movq %r10, %r15 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r15 -; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14 -; FALLBACK0-NEXT: leaq (%r14,%r14), %r11 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: orq %r15, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: addq %r10, %r10 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 -; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx -; FALLBACK0-NEXT: movq %rbx, %r12 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r12 -; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13 -; FALLBACK0-NEXT: leaq (%r13,%r13), %r15 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r15 -; FALLBACK0-NEXT: orq %r12, %r15 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r14 -; FALLBACK0-NEXT: addq %rbx, %rbx -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %rbx -; FALLBACK0-NEXT: orq %r14, %rbx -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r13 -; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r14 -; FALLBACK0-NEXT: orq %r13, %r14 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: sarq %cl, %rdi -; FALLBACK0-NEXT: movq %rdi, 56(%rdx) -; FALLBACK0-NEXT: movq %r14, 48(%rdx) -; FALLBACK0-NEXT: movq %rbx, 32(%rdx) -; FALLBACK0-NEXT: movq %r15, 40(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) -; FALLBACK0-NEXT: movq %r11, 24(%rdx) -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: popq %r12 -; FALLBACK0-NEXT: popq %r13 -; FALLBACK0-NEXT: popq %r14 -; FALLBACK0-NEXT: popq %r15 -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: ashr_64bytes: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: pushq %r15 -; FALLBACK1-NEXT: pushq %r14 -; FALLBACK1-NEXT: pushq %rbx -; FALLBACK1-NEXT: movq (%rdi), %rcx -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %r10 -; FALLBACK1-NEXT: movq 32(%rdi), %r11 -; FALLBACK1-NEXT: movq 40(%rdi), %rbx -; FALLBACK1-NEXT: movq 48(%rdi), %r14 -; FALLBACK1-NEXT: movq 56(%rdi), %rdi -; FALLBACK1-NEXT: movl (%rsi), %eax -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: sarq $63, %rdi -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: leal (,%rax,8), %ecx -; FALLBACK1-NEXT: andl $56, %ecx -; FALLBACK1-NEXT: andl $56, %eax -; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK1-NEXT: movq %r9, %r8 -; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK1-NEXT: movq %r11, %rbx -; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx -; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK1-NEXT: movq %r14, %r15 -; FALLBACK1-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK1-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi -; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: sarq %cl, %rax -; FALLBACK1-NEXT: movq %r11, 48(%rdx) -; FALLBACK1-NEXT: movq %rax, 56(%rdx) -; FALLBACK1-NEXT: movq %r10, 32(%rdx) -; FALLBACK1-NEXT: movq %r15, 40(%rdx) -; FALLBACK1-NEXT: movq %rdi, 16(%rdx) -; FALLBACK1-NEXT: movq %rbx, 24(%rdx) -; FALLBACK1-NEXT: movq %rsi, (%rdx) -; FALLBACK1-NEXT: movq %r8, 8(%rdx) -; FALLBACK1-NEXT: popq %rbx -; FALLBACK1-NEXT: popq %r14 -; FALLBACK1-NEXT: popq %r15 -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: ashr_64bytes: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %r15 -; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r12 -; FALLBACK2-NEXT: pushq %rbx -; FALLBACK2-NEXT: pushq %rax -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %r10 -; FALLBACK2-NEXT: movq 32(%rdi), %r11 -; FALLBACK2-NEXT: movq 40(%rdi), %rbx -; FALLBACK2-NEXT: movq 48(%rdi), %r14 -; FALLBACK2-NEXT: movq 56(%rdi), %rdi -; FALLBACK2-NEXT: movl (%rsi), %eax -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: sarq $63, %rdi -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: leal (,%rax,8), %ecx -; FALLBACK2-NEXT: andl $56, %ecx -; FALLBACK2-NEXT: movl %ecx, %esi -; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 -; FALLBACK2-NEXT: notb %cl -; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi -; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi -; FALLBACK2-NEXT: orq %r9, %rdi -; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 -; FALLBACK2-NEXT: addq %r8, %r8 -; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 -; FALLBACK2-NEXT: orq %r9, %r8 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK2-NEXT: shrxq %rsi, %r11, %rbx -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 -; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 -; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 -; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 -; FALLBACK2-NEXT: addq %r11, %r11 -; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 -; FALLBACK2-NEXT: orq %r10, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rsi, %r10, %rbx -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 -; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 -; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 -; FALLBACK2-NEXT: orq %rbx, %r12 -; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx -; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 -; FALLBACK2-NEXT: orq %rbx, %r10 -; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 -; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx -; FALLBACK2-NEXT: orq %rbx, %rcx -; FALLBACK2-NEXT: sarxq %rsi, %rax, %rax -; FALLBACK2-NEXT: movq %rax, 56(%rdx) -; FALLBACK2-NEXT: movq %rcx, 48(%rdx) -; FALLBACK2-NEXT: movq %r10, 32(%rdx) -; FALLBACK2-NEXT: movq %r12, 40(%rdx) -; FALLBACK2-NEXT: movq %r11, 16(%rdx) -; FALLBACK2-NEXT: movq %r9, 24(%rdx) -; FALLBACK2-NEXT: movq %r8, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) -; FALLBACK2-NEXT: addq $8, %rsp -; FALLBACK2-NEXT: popq %rbx -; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r14 -; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: ashr_64bytes: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: pushq %r15 -; FALLBACK3-NEXT: pushq %r14 -; FALLBACK3-NEXT: pushq %rbx -; FALLBACK3-NEXT: movq (%rdi), %rcx -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %r10 -; FALLBACK3-NEXT: movq 32(%rdi), %r11 -; FALLBACK3-NEXT: movq 40(%rdi), %rbx -; FALLBACK3-NEXT: movq 48(%rdi), %r14 -; FALLBACK3-NEXT: movq 56(%rdi), %rdi -; FALLBACK3-NEXT: movl (%rsi), %eax -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: sarq $63, %rdi -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: leal (,%rax,8), %ecx -; FALLBACK3-NEXT: andl $56, %ecx -; FALLBACK3-NEXT: andl $56, %eax -; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK3-NEXT: movq %r9, %r8 -; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK3-NEXT: movq %r11, %rbx -; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx -; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK3-NEXT: movq %r14, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax -; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi -; FALLBACK3-NEXT: movq %r11, 48(%rdx) -; FALLBACK3-NEXT: movq %r10, 32(%rdx) -; FALLBACK3-NEXT: movq %r15, 40(%rdx) -; FALLBACK3-NEXT: movq %rdi, 16(%rdx) -; FALLBACK3-NEXT: movq %rbx, 24(%rdx) -; FALLBACK3-NEXT: movq %rsi, (%rdx) -; FALLBACK3-NEXT: movq %r8, 8(%rdx) -; FALLBACK3-NEXT: movq %rax, 56(%rdx) -; FALLBACK3-NEXT: popq %rbx -; FALLBACK3-NEXT: popq %r14 -; FALLBACK3-NEXT: popq %r15 -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: ashr_64bytes: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq %rbp -; FALLBACK4-NEXT: pushq %r15 -; FALLBACK4-NEXT: pushq %r14 -; FALLBACK4-NEXT: pushq %r13 -; FALLBACK4-NEXT: pushq %r12 -; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: pushq %rax -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK4-NEXT: movq 48(%rdi), %rax -; FALLBACK4-NEXT: movq 56(%rdi), %rcx -; FALLBACK4-NEXT: movl (%rsi), %edi -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: sarq $63, %rcx -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: leal (,%rdi,8), %eax -; FALLBACK4-NEXT: andl $56, %eax -; FALLBACK4-NEXT: andl $56, %edi -; FALLBACK4-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK4-NEXT: movq -120(%rsp,%rdi), %r9 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r9,%r9), %r8 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r8 -; FALLBACK4-NEXT: orq %r10, %r8 -; FALLBACK4-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK4-NEXT: movq %r10, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbx -; FALLBACK4-NEXT: movq -96(%rsp,%rdi), %r12 -; FALLBACK4-NEXT: leaq (%r12,%r12), %r11 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r11 -; FALLBACK4-NEXT: orq %rbx, %r11 -; FALLBACK4-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK4-NEXT: movq %rbx, %r14 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r14 -; FALLBACK4-NEXT: addq %r10, %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r14, %r10 -; FALLBACK4-NEXT: movq -88(%rsp,%rdi), %r14 -; FALLBACK4-NEXT: movq %r14, %r13 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r13 -; FALLBACK4-NEXT: movq -80(%rsp,%rdi), %rbp -; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r15 -; FALLBACK4-NEXT: orq %r13, %r15 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r12 -; FALLBACK4-NEXT: addq %r14, %r14 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r14 -; FALLBACK4-NEXT: orq %r12, %r14 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbp -; FALLBACK4-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK4-NEXT: leaq (%rdi,%rdi), %r12 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r12 -; FALLBACK4-NEXT: orq %rbp, %r12 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r9 -; FALLBACK4-NEXT: addq %rbx, %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r9, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: sarq %cl, %rdi -; FALLBACK4-NEXT: movq %rdi, 56(%rdx) -; FALLBACK4-NEXT: movq %rbx, 8(%rdx) -; FALLBACK4-NEXT: movq %r12, 48(%rdx) -; FALLBACK4-NEXT: movq %r14, 32(%rdx) -; FALLBACK4-NEXT: movq %r15, 40(%rdx) -; FALLBACK4-NEXT: movq %r10, 16(%rdx) -; FALLBACK4-NEXT: movq %r11, 24(%rdx) -; FALLBACK4-NEXT: movq %r8, (%rdx) -; FALLBACK4-NEXT: addq $8, %rsp -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: popq %r12 -; FALLBACK4-NEXT: popq %r13 -; FALLBACK4-NEXT: popq %r14 -; FALLBACK4-NEXT: popq %r15 -; FALLBACK4-NEXT: popq %rbp -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: ashr_64bytes: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: pushq %r15 -; FALLBACK5-NEXT: pushq %r14 -; FALLBACK5-NEXT: pushq %rbx -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK5-NEXT: movq 48(%rdi), %rcx -; FALLBACK5-NEXT: movq 56(%rdi), %rdi -; FALLBACK5-NEXT: movl (%rsi), %eax -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: sarq $63, %rdi -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: andl $56, %ecx -; FALLBACK5-NEXT: andl $56, %eax -; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq %r9, %rsi -; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK5-NEXT: movq %r10, %r8 -; FALLBACK5-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK5-NEXT: movq %r11, %rbx -; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK5-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK5-NEXT: movq %rax, %r15 -; FALLBACK5-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK5-NEXT: sarq %cl, %r11 -; FALLBACK5-NEXT: movq %r15, 8(%rdx) -; FALLBACK5-NEXT: movq %r9, 48(%rdx) -; FALLBACK5-NEXT: movq %r11, 56(%rdx) -; FALLBACK5-NEXT: movq %rdi, 32(%rdx) -; FALLBACK5-NEXT: movq %rbx, 40(%rdx) -; FALLBACK5-NEXT: movq %r8, 16(%rdx) -; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r14, (%rdx) -; FALLBACK5-NEXT: popq %rbx -; FALLBACK5-NEXT: popq %r14 -; FALLBACK5-NEXT: popq %r15 -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: ashr_64bytes: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %r15 -; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r13 -; FALLBACK6-NEXT: pushq %r12 -; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK6-NEXT: movq 48(%rdi), %rcx -; FALLBACK6-NEXT: movq 56(%rdi), %rdi -; FALLBACK6-NEXT: movl (%rsi), %eax -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: sarq $63, %rdi -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %ecx -; FALLBACK6-NEXT: andl $56, %ecx -; FALLBACK6-NEXT: movl %ecx, %esi -; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 -; FALLBACK6-NEXT: notb %cl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK6-NEXT: leaq (%r10,%r10), %rdi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r8, %rdi -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 -; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 -; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 -; FALLBACK6-NEXT: orq %rbx, %r8 -; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx -; FALLBACK6-NEXT: addq %r11, %r11 -; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 -; FALLBACK6-NEXT: orq %rbx, %r11 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx -; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 -; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 -; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 -; FALLBACK6-NEXT: orq %r15, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 -; FALLBACK6-NEXT: addq %rbx, %rbx -; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx -; FALLBACK6-NEXT: orq %r14, %rbx -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 -; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 -; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 -; FALLBACK6-NEXT: orq %r14, %r15 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 -; FALLBACK6-NEXT: addq %r9, %r9 -; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx -; FALLBACK6-NEXT: orq %r10, %rcx -; FALLBACK6-NEXT: sarxq %rsi, %rax, %rax -; FALLBACK6-NEXT: movq %rax, 56(%rdx) -; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %r15, 48(%rdx) -; FALLBACK6-NEXT: movq %rbx, 32(%rdx) -; FALLBACK6-NEXT: movq %r13, 40(%rdx) -; FALLBACK6-NEXT: movq %r11, 16(%rdx) -; FALLBACK6-NEXT: movq %r8, 24(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) -; FALLBACK6-NEXT: popq %rbx -; FALLBACK6-NEXT: popq %r12 -; FALLBACK6-NEXT: popq %r13 -; FALLBACK6-NEXT: popq %r14 -; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: ashr_64bytes: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: pushq %r15 -; FALLBACK7-NEXT: pushq %r14 -; FALLBACK7-NEXT: pushq %rbx -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK7-NEXT: movq 48(%rdi), %rcx -; FALLBACK7-NEXT: movq 56(%rdi), %rdi -; FALLBACK7-NEXT: movl (%rsi), %eax -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: sarq $63, %rdi -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: leal (,%rax,8), %ecx -; FALLBACK7-NEXT: andl $56, %ecx -; FALLBACK7-NEXT: andl $56, %eax -; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK7-NEXT: movq %r9, %rsi -; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK7-NEXT: movq %r10, %r8 -; FALLBACK7-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK7-NEXT: movq %r11, %rbx -; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK7-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK7-NEXT: movq %rax, %r15 -; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK7-NEXT: sarxq %rcx, %r11, %r10 -; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK7-NEXT: movq %r15, 8(%rdx) -; FALLBACK7-NEXT: movq %r9, 48(%rdx) -; FALLBACK7-NEXT: movq %rdi, 32(%rdx) -; FALLBACK7-NEXT: movq %rbx, 40(%rdx) -; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rsi, 24(%rdx) -; FALLBACK7-NEXT: movq %r14, (%rdx) -; FALLBACK7-NEXT: movq %r10, 56(%rdx) -; FALLBACK7-NEXT: popq %rbx -; FALLBACK7-NEXT: popq %r14 -; FALLBACK7-NEXT: popq %r15 -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: ashr_64bytes: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: pushq %rbp -; FALLBACK8-NEXT: pushq %r15 -; FALLBACK8-NEXT: pushq %r14 -; FALLBACK8-NEXT: pushq %r13 -; FALLBACK8-NEXT: pushq %r12 -; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: pushq %rax -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: vmovups 32(%rdi), %xmm1 -; FALLBACK8-NEXT: movq 48(%rdi), %rax -; FALLBACK8-NEXT: movq 56(%rdi), %rcx -; FALLBACK8-NEXT: movl (%rsi), %edi -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: sarq $63, %rcx -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: leal (,%rdi,8), %eax -; FALLBACK8-NEXT: andl $56, %eax -; FALLBACK8-NEXT: andl $56, %edi -; FALLBACK8-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK8-NEXT: movq -120(%rsp,%rdi), %r9 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: leaq (%r9,%r9), %r8 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r8 -; FALLBACK8-NEXT: orq %r10, %r8 -; FALLBACK8-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK8-NEXT: movq %r10, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbx -; FALLBACK8-NEXT: movq -96(%rsp,%rdi), %r12 -; FALLBACK8-NEXT: leaq (%r12,%r12), %r11 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r11 -; FALLBACK8-NEXT: orq %rbx, %r11 -; FALLBACK8-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK8-NEXT: movq %rbx, %r14 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r14 -; FALLBACK8-NEXT: addq %r10, %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r14, %r10 -; FALLBACK8-NEXT: movq -88(%rsp,%rdi), %r14 -; FALLBACK8-NEXT: movq %r14, %r13 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r13 -; FALLBACK8-NEXT: movq -80(%rsp,%rdi), %rbp -; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r15 -; FALLBACK8-NEXT: orq %r13, %r15 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r12 -; FALLBACK8-NEXT: addq %r14, %r14 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r14 -; FALLBACK8-NEXT: orq %r12, %r14 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbp -; FALLBACK8-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK8-NEXT: leaq (%rdi,%rdi), %r12 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r12 -; FALLBACK8-NEXT: orq %rbp, %r12 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r9 -; FALLBACK8-NEXT: addq %rbx, %rbx -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r9, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: sarq %cl, %rdi -; FALLBACK8-NEXT: movq %rdi, 56(%rdx) -; FALLBACK8-NEXT: movq %rbx, 8(%rdx) -; FALLBACK8-NEXT: movq %r12, 48(%rdx) -; FALLBACK8-NEXT: movq %r14, 32(%rdx) -; FALLBACK8-NEXT: movq %r15, 40(%rdx) -; FALLBACK8-NEXT: movq %r10, 16(%rdx) -; FALLBACK8-NEXT: movq %r11, 24(%rdx) -; FALLBACK8-NEXT: movq %r8, (%rdx) -; FALLBACK8-NEXT: addq $8, %rsp -; FALLBACK8-NEXT: popq %rbx -; FALLBACK8-NEXT: popq %r12 -; FALLBACK8-NEXT: popq %r13 -; FALLBACK8-NEXT: popq %r14 -; FALLBACK8-NEXT: popq %r15 -; FALLBACK8-NEXT: popq %rbp -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: ashr_64bytes: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: pushq %r15 -; FALLBACK9-NEXT: pushq %r14 -; FALLBACK9-NEXT: pushq %rbx -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: vmovups 32(%rdi), %xmm1 -; FALLBACK9-NEXT: movq 48(%rdi), %rcx -; FALLBACK9-NEXT: movq 56(%rdi), %rdi -; FALLBACK9-NEXT: movl (%rsi), %eax -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: sarq $63, %rdi -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx -; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK9-NEXT: movq %r10, %r8 -; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK9-NEXT: movq %r11, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 -; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK9-NEXT: sarq %cl, %r11 -; FALLBACK9-NEXT: movq %r15, 8(%rdx) -; FALLBACK9-NEXT: movq %r9, 48(%rdx) -; FALLBACK9-NEXT: movq %r11, 56(%rdx) -; FALLBACK9-NEXT: movq %rdi, 32(%rdx) -; FALLBACK9-NEXT: movq %rbx, 40(%rdx) -; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r14, (%rdx) -; FALLBACK9-NEXT: popq %rbx -; FALLBACK9-NEXT: popq %r14 -; FALLBACK9-NEXT: popq %r15 -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: ashr_64bytes: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %r15 -; FALLBACK10-NEXT: pushq %r14 -; FALLBACK10-NEXT: pushq %r13 -; FALLBACK10-NEXT: pushq %r12 -; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: vmovups 32(%rdi), %xmm1 -; FALLBACK10-NEXT: movq 48(%rdi), %rcx -; FALLBACK10-NEXT: movq 56(%rdi), %rdi -; FALLBACK10-NEXT: movl (%rsi), %eax -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: sarq $63, %rdi -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %ecx -; FALLBACK10-NEXT: andl $56, %ecx -; FALLBACK10-NEXT: movl %ecx, %esi -; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 -; FALLBACK10-NEXT: notb %cl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %r10 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r8, %rdi -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK10-NEXT: shrxq %rsi, %r11, %rbx -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r14 -; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 -; FALLBACK10-NEXT: shlxq %rcx, %r8, %r8 -; FALLBACK10-NEXT: orq %rbx, %r8 -; FALLBACK10-NEXT: shrxq %rsi, %r9, %rbx -; FALLBACK10-NEXT: addq %r11, %r11 -; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 -; FALLBACK10-NEXT: orq %rbx, %r11 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %rbx -; FALLBACK10-NEXT: shrxq %rsi, %rbx, %r15 -; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 -; FALLBACK10-NEXT: shlxq %rcx, %r13, %r13 -; FALLBACK10-NEXT: orq %r15, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %r14, %r14 -; FALLBACK10-NEXT: addq %rbx, %rbx -; FALLBACK10-NEXT: shlxq %rcx, %rbx, %rbx -; FALLBACK10-NEXT: orq %r14, %rbx -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r14 -; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK10-NEXT: leaq (%rax,%rax), %r15 -; FALLBACK10-NEXT: shlxq %rcx, %r15, %r15 -; FALLBACK10-NEXT: orq %r14, %r15 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r10 -; FALLBACK10-NEXT: addq %r9, %r9 -; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx -; FALLBACK10-NEXT: orq %r10, %rcx -; FALLBACK10-NEXT: sarxq %rsi, %rax, %rax -; FALLBACK10-NEXT: movq %rax, 56(%rdx) -; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %r15, 48(%rdx) -; FALLBACK10-NEXT: movq %rbx, 32(%rdx) -; FALLBACK10-NEXT: movq %r13, 40(%rdx) -; FALLBACK10-NEXT: movq %r11, 16(%rdx) -; FALLBACK10-NEXT: movq %r8, 24(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) -; FALLBACK10-NEXT: popq %rbx -; FALLBACK10-NEXT: popq %r12 -; FALLBACK10-NEXT: popq %r13 -; FALLBACK10-NEXT: popq %r14 -; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: ashr_64bytes: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: pushq %r15 -; FALLBACK11-NEXT: pushq %r14 -; FALLBACK11-NEXT: pushq %rbx -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: vmovups 32(%rdi), %xmm1 -; FALLBACK11-NEXT: movq 48(%rdi), %rcx -; FALLBACK11-NEXT: movq 56(%rdi), %rdi -; FALLBACK11-NEXT: movl (%rsi), %eax -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: sarq $63, %rdi -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: leal (,%rax,8), %ecx -; FALLBACK11-NEXT: andl $56, %ecx -; FALLBACK11-NEXT: andl $56, %eax -; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq %r9, %rsi -; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK11-NEXT: movq %r10, %r8 -; FALLBACK11-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK11-NEXT: movq %r11, %rbx -; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK11-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r15 -; FALLBACK11-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK11-NEXT: sarxq %rcx, %r11, %r10 -; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK11-NEXT: movq %r15, 8(%rdx) -; FALLBACK11-NEXT: movq %r9, 48(%rdx) -; FALLBACK11-NEXT: movq %rdi, 32(%rdx) -; FALLBACK11-NEXT: movq %rbx, 40(%rdx) -; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rsi, 24(%rdx) -; FALLBACK11-NEXT: movq %r14, (%rdx) -; FALLBACK11-NEXT: movq %r10, 56(%rdx) -; FALLBACK11-NEXT: popq %rbx -; FALLBACK11-NEXT: popq %r14 -; FALLBACK11-NEXT: popq %r15 -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: ashr_64bytes: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: pushq %rbp -; FALLBACK12-NEXT: pushq %r15 -; FALLBACK12-NEXT: pushq %r14 -; FALLBACK12-NEXT: pushq %r13 -; FALLBACK12-NEXT: pushq %r12 -; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: pushq %rax -; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK12-NEXT: vmovups 32(%rdi), %xmm1 -; FALLBACK12-NEXT: movq 48(%rdi), %rax -; FALLBACK12-NEXT: movq 56(%rdi), %rcx -; FALLBACK12-NEXT: movl (%rsi), %edi -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: sarq $63, %rcx -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%rdi,8), %eax -; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %edi -; FALLBACK12-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK12-NEXT: movq -120(%rsp,%rdi), %r9 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: leaq (%r9,%r9), %r8 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r8 -; FALLBACK12-NEXT: orq %r10, %r8 -; FALLBACK12-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%rdi), %r12 -; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r11 -; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK12-NEXT: movq %rbx, %r14 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%rdi), %r14 -; FALLBACK12-NEXT: movq %r14, %r13 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%rdi), %rbp -; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r15 -; FALLBACK12-NEXT: orq %r13, %r15 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r12 -; FALLBACK12-NEXT: addq %r14, %r14 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r14 -; FALLBACK12-NEXT: orq %r12, %r14 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK12-NEXT: leaq (%rdi,%rdi), %r12 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r12 -; FALLBACK12-NEXT: orq %rbp, %r12 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: addq %rbx, %rbx -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r9, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: sarq %cl, %rdi -; FALLBACK12-NEXT: movq %rdi, 56(%rdx) -; FALLBACK12-NEXT: movq %rbx, 8(%rdx) -; FALLBACK12-NEXT: movq %r12, 48(%rdx) -; FALLBACK12-NEXT: movq %r14, 32(%rdx) -; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) -; FALLBACK12-NEXT: movq %r11, 24(%rdx) -; FALLBACK12-NEXT: movq %r8, (%rdx) -; FALLBACK12-NEXT: addq $8, %rsp -; FALLBACK12-NEXT: popq %rbx -; FALLBACK12-NEXT: popq %r12 -; FALLBACK12-NEXT: popq %r13 -; FALLBACK12-NEXT: popq %r14 -; FALLBACK12-NEXT: popq %r15 -; FALLBACK12-NEXT: popq %rbp -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: ashr_64bytes: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: pushq %r15 -; FALLBACK13-NEXT: pushq %r14 -; FALLBACK13-NEXT: pushq %rbx -; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK13-NEXT: vmovups 32(%rdi), %xmm1 -; FALLBACK13-NEXT: movq 48(%rdi), %rcx -; FALLBACK13-NEXT: movq 56(%rdi), %rdi -; FALLBACK13-NEXT: movl (%rsi), %eax -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: sarq $63, %rdi -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: leal (,%rax,8), %ecx -; FALLBACK13-NEXT: andl $56, %ecx -; FALLBACK13-NEXT: andl $56, %eax -; FALLBACK13-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK13-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK13-NEXT: movq %r9, %rsi -; FALLBACK13-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK13-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK13-NEXT: movq %r10, %r8 -; FALLBACK13-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK13-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK13-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK13-NEXT: movq %r11, %rbx -; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK13-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK13-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK13-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK13-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK13-NEXT: movq %rax, %r15 -; FALLBACK13-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: sarq %cl, %r11 -; FALLBACK13-NEXT: movq %r15, 8(%rdx) -; FALLBACK13-NEXT: movq %r9, 48(%rdx) -; FALLBACK13-NEXT: movq %r11, 56(%rdx) -; FALLBACK13-NEXT: movq %rdi, 32(%rdx) -; FALLBACK13-NEXT: movq %rbx, 40(%rdx) -; FALLBACK13-NEXT: movq %r8, 16(%rdx) -; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r14, (%rdx) -; FALLBACK13-NEXT: popq %rbx -; FALLBACK13-NEXT: popq %r14 -; FALLBACK13-NEXT: popq %r15 -; FALLBACK13-NEXT: vzeroupper -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: ashr_64bytes: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %r15 -; FALLBACK14-NEXT: pushq %r14 -; FALLBACK14-NEXT: pushq %r13 -; FALLBACK14-NEXT: pushq %r12 -; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1 -; FALLBACK14-NEXT: movq 48(%rdi), %rcx -; FALLBACK14-NEXT: movq 56(%rdi), %rdi -; FALLBACK14-NEXT: movl (%rsi), %eax -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: sarq $63, %rdi -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx -; FALLBACK14-NEXT: movl %ecx, %esi -; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 -; FALLBACK14-NEXT: notb %cl -; FALLBACK14-NEXT: movq -120(%rsp,%rax), %r10 -; FALLBACK14-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r8, %rdi -; FALLBACK14-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK14-NEXT: shrxq %rsi, %r11, %rbx -; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r14 -; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 -; FALLBACK14-NEXT: shlxq %rcx, %r8, %r8 -; FALLBACK14-NEXT: orq %rbx, %r8 -; FALLBACK14-NEXT: shrxq %rsi, %r9, %rbx -; FALLBACK14-NEXT: addq %r11, %r11 -; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 -; FALLBACK14-NEXT: orq %rbx, %r11 -; FALLBACK14-NEXT: movq -88(%rsp,%rax), %rbx -; FALLBACK14-NEXT: shrxq %rsi, %rbx, %r15 -; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 -; FALLBACK14-NEXT: shlxq %rcx, %r13, %r13 -; FALLBACK14-NEXT: orq %r15, %r13 -; FALLBACK14-NEXT: shrxq %rsi, %r14, %r14 -; FALLBACK14-NEXT: addq %rbx, %rbx -; FALLBACK14-NEXT: shlxq %rcx, %rbx, %rbx -; FALLBACK14-NEXT: orq %r14, %rbx -; FALLBACK14-NEXT: shrxq %rsi, %r12, %r14 -; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK14-NEXT: leaq (%rax,%rax), %r15 -; FALLBACK14-NEXT: shlxq %rcx, %r15, %r15 -; FALLBACK14-NEXT: orq %r14, %r15 -; FALLBACK14-NEXT: shrxq %rsi, %r10, %r10 -; FALLBACK14-NEXT: addq %r9, %r9 -; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx -; FALLBACK14-NEXT: orq %r10, %rcx -; FALLBACK14-NEXT: sarxq %rsi, %rax, %rax -; FALLBACK14-NEXT: movq %rax, 56(%rdx) -; FALLBACK14-NEXT: movq %rcx, 8(%rdx) -; FALLBACK14-NEXT: movq %r15, 48(%rdx) -; FALLBACK14-NEXT: movq %rbx, 32(%rdx) -; FALLBACK14-NEXT: movq %r13, 40(%rdx) -; FALLBACK14-NEXT: movq %r11, 16(%rdx) -; FALLBACK14-NEXT: movq %r8, 24(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) -; FALLBACK14-NEXT: popq %rbx -; FALLBACK14-NEXT: popq %r12 -; FALLBACK14-NEXT: popq %r13 -; FALLBACK14-NEXT: popq %r14 -; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: ashr_64bytes: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: pushq %r15 -; FALLBACK15-NEXT: pushq %r14 -; FALLBACK15-NEXT: pushq %rbx -; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK15-NEXT: vmovups 32(%rdi), %xmm1 -; FALLBACK15-NEXT: movq 48(%rdi), %rcx -; FALLBACK15-NEXT: movq 56(%rdi), %rdi -; FALLBACK15-NEXT: movl (%rsi), %eax -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: sarq $63, %rdi -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK15-NEXT: movq %r10, %r8 -; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK15-NEXT: movq %r11, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 -; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK15-NEXT: sarxq %rcx, %r11, %r10 -; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK15-NEXT: movq %r15, 8(%rdx) -; FALLBACK15-NEXT: movq %r9, 48(%rdx) -; FALLBACK15-NEXT: movq %rdi, 32(%rdx) -; FALLBACK15-NEXT: movq %rbx, 40(%rdx) -; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rsi, 24(%rdx) -; FALLBACK15-NEXT: movq %r14, (%rdx) -; FALLBACK15-NEXT: movq %r10, 56(%rdx) -; FALLBACK15-NEXT: popq %rbx -; FALLBACK15-NEXT: popq %r14 -; FALLBACK15-NEXT: popq %r15 -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq -; -; FALLBACK16-LABEL: ashr_64bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $204, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl (%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 16(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 20(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 24(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 28(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 40(%ecx), %ebx -; FALLBACK16-NEXT: movl 44(%ecx), %edi -; FALLBACK16-NEXT: movl 48(%ecx), %esi -; FALLBACK16-NEXT: movl 52(%ecx), %edx -; FALLBACK16-NEXT: movl 56(%ecx), %eax -; FALLBACK16-NEXT: movl 60(%ecx), %ecx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK16-NEXT: movl (%ebp), %ebp -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: sarl $31, %ecx -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, %ecx -; FALLBACK16-NEXT: movl %ebp, %esi -; FALLBACK16-NEXT: andl $60, %esi -; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK16-NEXT: shll $3, %ecx -; FALLBACK16-NEXT: andl $24, %ecx -; FALLBACK16-NEXT: movl %edx, %eax -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 72(%esp,%esi), %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: addl %edi, %edi -; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK16-NEXT: movl %ecx, %ebx -; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK16-NEXT: notb %ch -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %eax, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 64(%esp,%esi), %eax -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %edx, %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %eax, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 76(%esp,%esi), %ebp -; FALLBACK16-NEXT: movl %ebp, %edx -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %edx, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %ebp, %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: movl 84(%esp,%esi), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 88(%esp,%esi), %esi -; FALLBACK16-NEXT: leal (%esi,%esi), %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: addl %ebx, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %edx, %eax -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 92(%esp,%edx), %ebp -; FALLBACK16-NEXT: movl %ebp, %edx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %edx, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: addl %ebp, %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %esi, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl 100(%esp,%edx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 104(%esp,%edx), %esi -; FALLBACK16-NEXT: leal (%esi,%esi), %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebx, %edx -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: addl %ebx, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK16-NEXT: movl 108(%esp,%ebp), %edi -; FALLBACK16-NEXT: movl %edi, %eax -; FALLBACK16-NEXT: movl %edx, %ebx -; FALLBACK16-NEXT: movl %ebx, %ecx -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 112(%esp,%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebp, %edx -; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: addl %edi, %edi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %esi, %edi -; FALLBACK16-NEXT: movl 116(%esp,%edx), %esi -; FALLBACK16-NEXT: movl %esi, %eax -; FALLBACK16-NEXT: movl %ebx, %ecx -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 120(%esp,%edx), %edx -; FALLBACK16-NEXT: leal (%edx,%edx), %ebp -; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %esi, %esi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %eax, %esi -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: movl %edx, %eax -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %eax, %edx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK16-NEXT: sarl %cl, %ebx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl %ebx, 60(%eax) -; FALLBACK16-NEXT: movl %edx, 56(%eax) -; FALLBACK16-NEXT: movl %esi, 48(%eax) -; FALLBACK16-NEXT: movl %ebp, 52(%eax) -; FALLBACK16-NEXT: movl %edi, 40(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 44(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 32(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 36(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 24(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 28(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 16(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 20(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 8(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 12(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, (%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $204, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: ashr_64bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $188, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl (%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 12(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 16(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 20(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 24(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 28(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 36(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%eax), %ebp -; FALLBACK17-NEXT: movl 44(%eax), %ebx -; FALLBACK17-NEXT: movl 48(%eax), %edi -; FALLBACK17-NEXT: movl 52(%eax), %esi -; FALLBACK17-NEXT: movl 56(%eax), %edx -; FALLBACK17-NEXT: movl 60(%eax), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %ecx -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: sarl $31, %eax -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ecx, %ebp -; FALLBACK17-NEXT: andl $60, %ebp -; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shll $3, %ecx -; FALLBACK17-NEXT: andl $24, %ecx -; FALLBACK17-NEXT: shrdl %cl, %edx, %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %esi -; FALLBACK17-NEXT: shrdl %cl, %edi, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %esi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %edi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %esi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl %esi, %edx -; FALLBACK17-NEXT: shrdl %cl, %eax, %edi -; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %esi, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %edx, %edi -; FALLBACK17-NEXT: shrdl %cl, %eax, %esi -; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 56(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK17-NEXT: sarl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 60(%ebp) -; FALLBACK17-NEXT: movl %esi, 48(%ebp) -; FALLBACK17-NEXT: movl %edi, 52(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 40(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 44(%ebp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 32(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 36(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 24(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 16(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %ebx, (%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) -; FALLBACK17-NEXT: addl $188, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: popl %ebp -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: ashr_64bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $204, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 12(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 20(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 28(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 36(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebp -; FALLBACK18-NEXT: movl 44(%eax), %ebx -; FALLBACK18-NEXT: movl 48(%eax), %edi -; FALLBACK18-NEXT: movl 52(%eax), %esi -; FALLBACK18-NEXT: movl 56(%eax), %edx -; FALLBACK18-NEXT: movl 60(%eax), %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %eax -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: sarl $31, %ecx -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, %ecx -; FALLBACK18-NEXT: leal (,%eax,8), %edx -; FALLBACK18-NEXT: andl $24, %edx -; FALLBACK18-NEXT: movl %edx, %ebx -; FALLBACK18-NEXT: andl $60, %ecx -; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%edi,%edi), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi -; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: orl %esi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi -; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %eax -; FALLBACK18-NEXT: orl %esi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %ecx, %ebp -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: orl %ecx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 120(%esp,%ebp), %edi -; FALLBACK18-NEXT: leal (%edi,%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK18-NEXT: movl 116(%esp,%ebp), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %ebp -; FALLBACK18-NEXT: orl %ebp, %esi -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK18-NEXT: orl %ebp, %ecx -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl 124(%esp,%eax), %eax -; FALLBACK18-NEXT: leal (%eax,%eax), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %edx -; FALLBACK18-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK18-NEXT: orl %edi, %edx -; FALLBACK18-NEXT: sarxl %ebx, %eax, %edi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edi, 60(%eax) -; FALLBACK18-NEXT: movl %edx, 56(%eax) -; FALLBACK18-NEXT: movl %ecx, 48(%eax) -; FALLBACK18-NEXT: movl %esi, 52(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 40(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 44(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 32(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 36(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 24(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 28(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) -; FALLBACK18-NEXT: addl $204, %esp -; FALLBACK18-NEXT: popl %esi -; FALLBACK18-NEXT: popl %edi -; FALLBACK18-NEXT: popl %ebx -; FALLBACK18-NEXT: popl %ebp -; FALLBACK18-NEXT: retl -; -; FALLBACK19-LABEL: ashr_64bytes: -; FALLBACK19: # %bb.0: -; FALLBACK19-NEXT: pushl %ebp -; FALLBACK19-NEXT: pushl %ebx -; FALLBACK19-NEXT: pushl %edi -; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $188, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK19-NEXT: movl (%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 12(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 16(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 20(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 24(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 28(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 36(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%eax), %ebp -; FALLBACK19-NEXT: movl 44(%eax), %ebx -; FALLBACK19-NEXT: movl 48(%eax), %edi -; FALLBACK19-NEXT: movl 52(%eax), %esi -; FALLBACK19-NEXT: movl 56(%eax), %edx -; FALLBACK19-NEXT: movl 60(%eax), %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %ecx -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: sarl $31, %eax -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ecx, %ebp -; FALLBACK19-NEXT: andl $60, %ebp -; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shll $3, %ecx -; FALLBACK19-NEXT: andl $24, %ecx -; FALLBACK19-NEXT: shrdl %cl, %edx, %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %esi -; FALLBACK19-NEXT: shrdl %cl, %edi, %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %esi, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %edi -; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %edi, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx -; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %edi -; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %esi, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi -; FALLBACK19-NEXT: movl %edi, %edx -; FALLBACK19-NEXT: shrdl %cl, %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %edi, %esi -; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl %eax, 56(%ebp) -; FALLBACK19-NEXT: movl %esi, 48(%ebp) -; FALLBACK19-NEXT: movl %edx, 52(%ebp) -; FALLBACK19-NEXT: movl %ebx, 40(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 44(%ebp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 32(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 36(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 24(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 28(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 16(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 20(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 8(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 12(%ebp) -; FALLBACK19-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: shrdl %cl, %edx, %edi -; FALLBACK19-NEXT: movl %edi, (%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 4(%ebp) -; FALLBACK19-NEXT: movl %eax, 60(%ebp) -; FALLBACK19-NEXT: addl $188, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: ashr_64bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $204, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK20-NEXT: movl 48(%ecx), %edx -; FALLBACK20-NEXT: movl 52(%ecx), %esi -; FALLBACK20-NEXT: movl 56(%ecx), %edi -; FALLBACK20-NEXT: movl 60(%ecx), %ecx -; FALLBACK20-NEXT: movl (%eax), %eax -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: sarl $31, %ecx -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %eax, %esi -; FALLBACK20-NEXT: andl $60, %esi -; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK20-NEXT: shll $3, %eax -; FALLBACK20-NEXT: andl $24, %eax -; FALLBACK20-NEXT: movl %edx, %edi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movb %al, %ch -; FALLBACK20-NEXT: notb %ch -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %edx, %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %edi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK20-NEXT: movl %edx, %ebp -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %edx, %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: addl %eax, %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK20-NEXT: leal (%edx,%edx), %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK20-NEXT: movl %edi, %ebp -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: addl %edi, %edi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %edx, %edi -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK20-NEXT: movl %esi, %ebx -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK20-NEXT: leal (%eax,%eax), %ebp -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %eax, %edx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK20-NEXT: sarl %cl, %ebx -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl %ebx, 60(%eax) -; FALLBACK20-NEXT: movl %edx, 56(%eax) -; FALLBACK20-NEXT: movl %esi, 48(%eax) -; FALLBACK20-NEXT: movl %ebp, 52(%eax) -; FALLBACK20-NEXT: movl %edi, 40(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 44(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 32(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 36(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 24(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 28(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 16(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 20(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 8(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 12(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, (%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 4(%eax) -; FALLBACK20-NEXT: addl $204, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: ashr_64bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $188, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: movups (%eax), %xmm0 -; FALLBACK21-NEXT: movups 16(%eax), %xmm1 -; FALLBACK21-NEXT: movups 32(%eax), %xmm2 -; FALLBACK21-NEXT: movl 48(%eax), %edx -; FALLBACK21-NEXT: movl 52(%eax), %esi -; FALLBACK21-NEXT: movl 56(%eax), %edi -; FALLBACK21-NEXT: movl 60(%eax), %eax -; FALLBACK21-NEXT: movl (%ecx), %ecx -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: sarl $31, %eax -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %ecx, %ebp -; FALLBACK21-NEXT: andl $60, %ebp -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shll $3, %ecx -; FALLBACK21-NEXT: andl $24, %ecx -; FALLBACK21-NEXT: shrdl %cl, %edx, %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %esi -; FALLBACK21-NEXT: shrdl %cl, %edi, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %edi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl %esi, %edx -; FALLBACK21-NEXT: shrdl %cl, %eax, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %esi, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %edx, %edi -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %edx, 56(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK21-NEXT: sarl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 60(%ebp) -; FALLBACK21-NEXT: movl %esi, 48(%ebp) -; FALLBACK21-NEXT: movl %edi, 52(%ebp) -; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 40(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 44(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 32(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 36(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 24(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 28(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %ebx, (%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 4(%ebp) -; FALLBACK21-NEXT: addl $188, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: ashr_64bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $204, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK22-NEXT: movl 48(%ecx), %edx -; FALLBACK22-NEXT: movl 52(%ecx), %esi -; FALLBACK22-NEXT: movl 56(%ecx), %edi -; FALLBACK22-NEXT: movl 60(%ecx), %ecx -; FALLBACK22-NEXT: movl (%eax), %eax -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: sarl $31, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: leal (,%eax,8), %edx -; FALLBACK22-NEXT: andl $24, %edx -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: andl $60, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: notb %dl -; FALLBACK22-NEXT: leal (%edi,%edi), %ebp -; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %eax -; FALLBACK22-NEXT: orl %edi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %edi -; FALLBACK22-NEXT: orl %eax, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %edi -; FALLBACK22-NEXT: orl %eax, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %ebp -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %eax -; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK22-NEXT: orl %edi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %eax -; FALLBACK22-NEXT: orl %ecx, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 120(%esp,%ebp), %edi -; FALLBACK22-NEXT: leal (%edi,%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 116(%esp,%ebp), %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %ebp -; FALLBACK22-NEXT: orl %ebp, %esi -; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK22-NEXT: orl %ebp, %ecx -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%eax), %eax -; FALLBACK22-NEXT: leal (%eax,%eax), %ebp -; FALLBACK22-NEXT: shlxl %edx, %ebp, %edx -; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK22-NEXT: orl %edi, %edx -; FALLBACK22-NEXT: sarxl %ebx, %eax, %edi -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edi, 60(%eax) -; FALLBACK22-NEXT: movl %edx, 56(%eax) -; FALLBACK22-NEXT: movl %ecx, 48(%eax) -; FALLBACK22-NEXT: movl %esi, 52(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 40(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 44(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 32(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 36(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 24(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 28(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 16(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 20(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 8(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 12(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, (%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 4(%eax) -; FALLBACK22-NEXT: addl $204, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: ashr_64bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $188, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movups (%eax), %xmm0 -; FALLBACK23-NEXT: movups 16(%eax), %xmm1 -; FALLBACK23-NEXT: movups 32(%eax), %xmm2 -; FALLBACK23-NEXT: movl 48(%eax), %edx -; FALLBACK23-NEXT: movl 52(%eax), %esi -; FALLBACK23-NEXT: movl 56(%eax), %edi -; FALLBACK23-NEXT: movl 60(%eax), %eax -; FALLBACK23-NEXT: movl (%ecx), %ecx -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: sarl $31, %eax -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %ecx, %ebp -; FALLBACK23-NEXT: andl $60, %ebp -; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shll $3, %ecx -; FALLBACK23-NEXT: andl $24, %ecx -; FALLBACK23-NEXT: shrdl %cl, %edx, %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %esi -; FALLBACK23-NEXT: shrdl %cl, %edi, %esi -; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %edi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx -; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl %edi, %edx -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %edi, %esi -; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK23-NEXT: movl %eax, 56(%ebp) -; FALLBACK23-NEXT: movl %esi, 48(%ebp) -; FALLBACK23-NEXT: movl %edx, 52(%ebp) -; FALLBACK23-NEXT: movl %ebx, 40(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 44(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 32(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 36(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 24(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 28(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 16(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 20(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 8(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 12(%ebp) -; FALLBACK23-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %edx, %edi -; FALLBACK23-NEXT: movl %edi, (%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 4(%ebp) -; FALLBACK23-NEXT: movl %eax, 60(%ebp) -; FALLBACK23-NEXT: addl $188, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: ashr_64bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $204, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK24-NEXT: vmovups 32(%ecx), %xmm1 -; FALLBACK24-NEXT: movl 48(%ecx), %edx -; FALLBACK24-NEXT: movl 52(%ecx), %esi -; FALLBACK24-NEXT: movl 56(%ecx), %edi -; FALLBACK24-NEXT: movl 60(%ecx), %ecx -; FALLBACK24-NEXT: movl (%eax), %eax -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: sarl $31, %ecx -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %eax, %esi -; FALLBACK24-NEXT: andl $60, %esi -; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK24-NEXT: shll $3, %eax -; FALLBACK24-NEXT: andl $24, %eax -; FALLBACK24-NEXT: movl %edx, %edi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK24-NEXT: movb %al, %ch -; FALLBACK24-NEXT: notb %ch -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %edx, %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %edi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK24-NEXT: movl %edx, %ebp -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %edx, %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: addl %eax, %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK24-NEXT: leal (%edx,%edx), %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK24-NEXT: movl %edi, %ebp -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: addl %edi, %edi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %edx, %edi -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK24-NEXT: movl %esi, %ebx -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK24-NEXT: leal (%eax,%eax), %ebp -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %eax, %edx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK24-NEXT: sarl %cl, %ebx -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl %ebx, 60(%eax) -; FALLBACK24-NEXT: movl %edx, 56(%eax) -; FALLBACK24-NEXT: movl %esi, 48(%eax) -; FALLBACK24-NEXT: movl %ebp, 52(%eax) -; FALLBACK24-NEXT: movl %edi, 40(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 44(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 32(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 36(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 24(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 28(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 16(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 20(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 8(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 12(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, (%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 4(%eax) -; FALLBACK24-NEXT: addl $204, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: vzeroupper -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: ashr_64bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $188, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: vmovups (%eax), %ymm0 -; FALLBACK25-NEXT: vmovups 32(%eax), %xmm1 -; FALLBACK25-NEXT: movl 48(%eax), %edx -; FALLBACK25-NEXT: movl 52(%eax), %esi -; FALLBACK25-NEXT: movl 56(%eax), %edi -; FALLBACK25-NEXT: movl 60(%eax), %eax -; FALLBACK25-NEXT: movl (%ecx), %ecx -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: sarl $31, %eax -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ecx, %ebp -; FALLBACK25-NEXT: andl $60, %ebp -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shll $3, %ecx -; FALLBACK25-NEXT: andl $24, %ecx -; FALLBACK25-NEXT: shrdl %cl, %edx, %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %esi -; FALLBACK25-NEXT: shrdl %cl, %edi, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %edi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl %esi, %edx -; FALLBACK25-NEXT: shrdl %cl, %eax, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %edx, %edi -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %edx, 56(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK25-NEXT: sarl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 60(%ebp) -; FALLBACK25-NEXT: movl %esi, 48(%ebp) -; FALLBACK25-NEXT: movl %edi, 52(%ebp) -; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %ebx, (%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 4(%ebp) -; FALLBACK25-NEXT: addl $188, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: vzeroupper -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: ashr_64bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $204, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: vmovups 32(%ecx), %xmm1 -; FALLBACK26-NEXT: movl 48(%ecx), %edx -; FALLBACK26-NEXT: movl 52(%ecx), %esi -; FALLBACK26-NEXT: movl 56(%ecx), %edi -; FALLBACK26-NEXT: movl 60(%ecx), %ecx -; FALLBACK26-NEXT: movl (%eax), %eax -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: sarl $31, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: leal (,%eax,8), %edx -; FALLBACK26-NEXT: andl $24, %edx -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: andl $60, %ecx -; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: notb %dl -; FALLBACK26-NEXT: leal (%edi,%edi), %ebp -; FALLBACK26-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %eax -; FALLBACK26-NEXT: orl %edi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %edi -; FALLBACK26-NEXT: orl %eax, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %edi -; FALLBACK26-NEXT: orl %eax, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %ecx, %ebp -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %eax -; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK26-NEXT: orl %edi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %eax -; FALLBACK26-NEXT: orl %ecx, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 120(%esp,%ebp), %edi -; FALLBACK26-NEXT: leal (%edi,%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 116(%esp,%ebp), %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %ebp -; FALLBACK26-NEXT: orl %ebp, %esi -; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK26-NEXT: orl %ebp, %ecx -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl 124(%esp,%eax), %eax -; FALLBACK26-NEXT: leal (%eax,%eax), %ebp -; FALLBACK26-NEXT: shlxl %edx, %ebp, %edx -; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK26-NEXT: orl %edi, %edx -; FALLBACK26-NEXT: sarxl %ebx, %eax, %edi -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl %edi, 60(%eax) -; FALLBACK26-NEXT: movl %edx, 56(%eax) -; FALLBACK26-NEXT: movl %ecx, 48(%eax) -; FALLBACK26-NEXT: movl %esi, 52(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 40(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 44(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 32(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 36(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 24(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 28(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 16(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 20(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 8(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 12(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, (%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 4(%eax) -; FALLBACK26-NEXT: addl $204, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: vzeroupper -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: ashr_64bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $188, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: vmovups (%eax), %ymm0 -; FALLBACK27-NEXT: vmovups 32(%eax), %xmm1 -; FALLBACK27-NEXT: movl 48(%eax), %edx -; FALLBACK27-NEXT: movl 52(%eax), %esi -; FALLBACK27-NEXT: movl 56(%eax), %edi -; FALLBACK27-NEXT: movl 60(%eax), %eax -; FALLBACK27-NEXT: movl (%ecx), %ecx -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: sarl $31, %eax -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: movl %ecx, %ebp -; FALLBACK27-NEXT: andl $60, %ebp -; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shll $3, %ecx -; FALLBACK27-NEXT: andl $24, %ecx -; FALLBACK27-NEXT: shrdl %cl, %edx, %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, %esi -; FALLBACK27-NEXT: shrdl %cl, %edi, %esi -; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %edi, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %esi -; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx -; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi -; FALLBACK27-NEXT: movl %edi, %edx -; FALLBACK27-NEXT: shrdl %cl, %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %edi, %esi -; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK27-NEXT: movl %eax, 56(%ebp) -; FALLBACK27-NEXT: movl %esi, 48(%ebp) -; FALLBACK27-NEXT: movl %edx, 52(%ebp) -; FALLBACK27-NEXT: movl %ebx, 40(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 44(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 32(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 36(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 24(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 28(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 16(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 20(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 8(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 12(%ebp) -; FALLBACK27-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %edx, %edi -; FALLBACK27-NEXT: movl %edi, (%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 4(%ebp) -; FALLBACK27-NEXT: movl %eax, 60(%ebp) -; FALLBACK27-NEXT: addl $188, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: vzeroupper -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: ashr_64bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $204, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK28-NEXT: vmovups 32(%ecx), %xmm1 -; FALLBACK28-NEXT: movl 48(%ecx), %edx -; FALLBACK28-NEXT: movl 52(%ecx), %esi -; FALLBACK28-NEXT: movl 56(%ecx), %edi -; FALLBACK28-NEXT: movl 60(%ecx), %ecx -; FALLBACK28-NEXT: movl (%eax), %eax -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: sarl $31, %ecx -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %eax, %esi -; FALLBACK28-NEXT: andl $60, %esi -; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK28-NEXT: shll $3, %eax -; FALLBACK28-NEXT: andl $24, %eax -; FALLBACK28-NEXT: movl %edx, %edi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK28-NEXT: movb %al, %ch -; FALLBACK28-NEXT: notb %ch -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %edx, %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %edi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %edx, %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: addl %eax, %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK28-NEXT: leal (%edx,%edx), %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK28-NEXT: movl %edi, %ebp -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: addl %edi, %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK28-NEXT: movl %esi, %ebx -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK28-NEXT: leal (%eax,%eax), %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %eax, %edx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK28-NEXT: sarl %cl, %ebx -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl %ebx, 60(%eax) -; FALLBACK28-NEXT: movl %edx, 56(%eax) -; FALLBACK28-NEXT: movl %esi, 48(%eax) -; FALLBACK28-NEXT: movl %ebp, 52(%eax) -; FALLBACK28-NEXT: movl %edi, 40(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 44(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 32(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 36(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 24(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 28(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 16(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 20(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 8(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 12(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, (%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 4(%eax) -; FALLBACK28-NEXT: addl $204, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: vzeroupper -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: ashr_64bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $188, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: vmovups (%eax), %ymm0 -; FALLBACK29-NEXT: vmovups 32(%eax), %xmm1 -; FALLBACK29-NEXT: movl 48(%eax), %edx -; FALLBACK29-NEXT: movl 52(%eax), %esi -; FALLBACK29-NEXT: movl 56(%eax), %edi -; FALLBACK29-NEXT: movl 60(%eax), %eax -; FALLBACK29-NEXT: movl (%ecx), %ecx -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: sarl $31, %eax -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ecx, %ebp -; FALLBACK29-NEXT: andl $60, %ebp -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shll $3, %ecx -; FALLBACK29-NEXT: andl $24, %ecx -; FALLBACK29-NEXT: shrdl %cl, %edx, %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %esi -; FALLBACK29-NEXT: shrdl %cl, %edi, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %edi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl %esi, %edx -; FALLBACK29-NEXT: shrdl %cl, %eax, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %edx, %edi -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %edx, 56(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK29-NEXT: sarl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 60(%ebp) -; FALLBACK29-NEXT: movl %esi, 48(%ebp) -; FALLBACK29-NEXT: movl %edi, 52(%ebp) -; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %ebx, (%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 4(%ebp) -; FALLBACK29-NEXT: addl $188, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: vzeroupper -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: ashr_64bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $204, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: vmovups 32(%ecx), %xmm1 -; FALLBACK30-NEXT: movl 48(%ecx), %edx -; FALLBACK30-NEXT: movl 52(%ecx), %esi -; FALLBACK30-NEXT: movl 56(%ecx), %edi -; FALLBACK30-NEXT: movl 60(%ecx), %ecx -; FALLBACK30-NEXT: movl (%eax), %eax -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: sarl $31, %ecx -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: leal (,%eax,8), %edx -; FALLBACK30-NEXT: andl $24, %edx -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: andl $60, %ecx -; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: notb %dl -; FALLBACK30-NEXT: leal (%edi,%edi), %ebp -; FALLBACK30-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %eax -; FALLBACK30-NEXT: orl %edi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edi -; FALLBACK30-NEXT: orl %eax, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edi -; FALLBACK30-NEXT: orl %eax, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %ecx, %ebp -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %eax -; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi -; FALLBACK30-NEXT: orl %edi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %eax -; FALLBACK30-NEXT: orl %ecx, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 120(%esp,%ebp), %edi -; FALLBACK30-NEXT: leal (%edi,%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 116(%esp,%ebp), %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %ebp -; FALLBACK30-NEXT: orl %ebp, %esi -; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK30-NEXT: orl %ebp, %ecx -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl 124(%esp,%eax), %eax -; FALLBACK30-NEXT: leal (%eax,%eax), %ebp -; FALLBACK30-NEXT: shlxl %edx, %ebp, %edx -; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK30-NEXT: orl %edi, %edx -; FALLBACK30-NEXT: sarxl %ebx, %eax, %edi -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl %edi, 60(%eax) -; FALLBACK30-NEXT: movl %edx, 56(%eax) -; FALLBACK30-NEXT: movl %ecx, 48(%eax) -; FALLBACK30-NEXT: movl %esi, 52(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 40(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 44(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 32(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 36(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 24(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 28(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 16(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 20(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 8(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 12(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, (%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 4(%eax) -; FALLBACK30-NEXT: addl $204, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: vzeroupper -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: ashr_64bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $188, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: vmovups (%eax), %ymm0 -; FALLBACK31-NEXT: vmovups 32(%eax), %xmm1 -; FALLBACK31-NEXT: movl 48(%eax), %edx -; FALLBACK31-NEXT: movl 52(%eax), %esi -; FALLBACK31-NEXT: movl 56(%eax), %edi -; FALLBACK31-NEXT: movl 60(%eax), %eax -; FALLBACK31-NEXT: movl (%ecx), %ecx -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: sarl $31, %eax -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %ecx, %ebp -; FALLBACK31-NEXT: andl $60, %ebp -; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shll $3, %ecx -; FALLBACK31-NEXT: andl $24, %ecx -; FALLBACK31-NEXT: shrdl %cl, %edx, %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %esi -; FALLBACK31-NEXT: shrdl %cl, %edi, %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %edi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx -; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl %edi, %edx -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %edi, %esi -; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK31-NEXT: movl %eax, 56(%ebp) -; FALLBACK31-NEXT: movl %esi, 48(%ebp) -; FALLBACK31-NEXT: movl %edx, 52(%ebp) -; FALLBACK31-NEXT: movl %ebx, 40(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 44(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 32(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 36(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 24(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 28(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 16(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 20(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 8(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 12(%ebp) -; FALLBACK31-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %edx, %edi -; FALLBACK31-NEXT: movl %edi, (%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 4(%ebp) -; FALLBACK31-NEXT: movl %eax, 60(%ebp) -; FALLBACK31-NEXT: addl $188, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: vzeroupper -; FALLBACK31-NEXT: retl +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %edi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rdi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %edi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r8, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r15, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rdi), %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r13,%r13), %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rdi,%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r13, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r8, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r10,%r10), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r11, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r10, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r15,%r15), %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r12, %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r14, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r15, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rax,%rax), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r14, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rsi, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r12, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rdi), %rax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %edi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rcx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rdi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %edi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rdi), %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r14, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rdi), %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rbp,%rbp), %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r14, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rdi,%rdi), %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbp, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq $8, %rsp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r13 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r10,%r10), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r11, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r14,%r14), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rbx, %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r13 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r13, %r13 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %r13 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r14, %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r12, %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rax,%rax), %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rsi, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r13, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r13 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %r11, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes: +; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %edi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rcx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rdi,8), %eax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %edi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rdi), %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r12,%r12), %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r14, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, %r13 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rdi), %rbp +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rbp,%rbp), %r15 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r13, %r15 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r14, %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r12, %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbp +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rdi,%rdi), %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbp, %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r12, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq $8, %rsp +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes: +; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rcx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes: +; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r10,%r10), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r11, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r14,%r14), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rbx, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r12,%r12), %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r13, %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r15, %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r14, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r12, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r15, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rsi, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r13, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %r11, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%ecx), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ebp), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%esi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%esi), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%edx), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%eax), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%edx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 112(%esp,%ebp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 116(%esp,%edx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 120(%esp,%edx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edx,%edx), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 124(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 60(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 40(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%eax,8), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 112(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 120(%esp,%ebp), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 116(%esp,%ebp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 124(%esp,%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ebx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 52(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 40(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%ecx), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %ch +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edx,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 112(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 116(%esp,%esi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 120(%esp,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 124(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ebx,%ebx), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 60(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 40(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%eax), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%eax), %xmm1 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%eax), %xmm2 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%eax,8), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 112(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 120(%esp,%ebp), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%edi,%edi), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 116(%esp,%ebp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 124(%esp,%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ebx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 52(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%eax), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%eax), %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%eax), %xmm2 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 40(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes: +; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%ecx), %xmm1 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%ecx), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andl $60, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll $3, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andl $24, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %ch +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 96(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 100(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 104(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edx,%edx), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 108(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 112(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 116(%esp,%esi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 120(%esp,%edx), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 124(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ebx,%ebx), %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 60(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 48(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 40(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%eax), %ymm0 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%eax), %xmm1 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 104(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 100(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 108(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%ecx), %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%eax,8), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $24, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $60, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 96(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 92(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 104(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 100(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 112(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 108(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 120(%esp,%ebp), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%edi,%edi), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 116(%esp,%ebp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 124(%esp,%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ebx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 52(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%eax), %ymm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%eax), %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 108(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 40(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3