diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index df7d227d31db6d..be77051cf82e1c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30730,6 +30730,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(V); return; } + case ISD::BITREVERSE: + assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + assert(Subtarget.hasXOP() && "Expected XOP"); + // We can use VPPERM by copying to a vector register and back. We'll need + // to move the scalar in two i32 pieces. + Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG)); + return; } } diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index 343d9fb2da2de5..8e2f6f9b463b0d 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86 ; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+xop | FileCheck %s --check-prefixes=X86XOP ; These tests just check that the plumbing is in place for @llvm.bitreverse. The ; actual output is massive at the moment as llvm.bitreverse is not yet legal. 
@@ -75,6 +76,11 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind { ; X64-NEXT: psrlw $1, %xmm0 ; X64-NEXT: por %xmm1, %xmm0 ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_v2i16: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: retl %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a) ret <2 x i16> %b } @@ -145,6 +151,14 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind { ; X64-NEXT: shrq %rdx ; X64-NEXT: leaq (%rdx,%rcx,2), %rax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i64: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: vpextrd $1, %xmm0, %edx +; X86XOP-NEXT: retl %b = call i64 @llvm.bitreverse.i64(i64 %a) ret i64 %b } @@ -195,6 +209,13 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; X64-NEXT: shrl %eax ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i32: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: retl %b = call i32 @llvm.bitreverse.i32(i32 %a) ret i32 %b } @@ -247,6 +268,14 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind { ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: shrl $8, %eax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i24: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: shrl $8, %eax +; X86XOP-NEXT: retl %b = call i24 @llvm.bitreverse.i24(i24 %a) ret i24 %b } @@ -299,6 +328,14 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i16: +; X86XOP: # %bb.0: +; 
X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: # kill: def $ax killed $ax killed $eax +; X86XOP-NEXT: retl %b = call i16 @llvm.bitreverse.i16(i16 %a) ret i16 %b } @@ -342,6 +379,14 @@ define i8 @test_bitreverse_i8(i8 %a) { ; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i8: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: # kill: def $al killed $al killed $eax +; X86XOP-NEXT: retl %b = call i8 @llvm.bitreverse.i8(i8 %a) ret i8 %b } @@ -387,6 +432,15 @@ define i4 @test_bitreverse_i4(i4 %a) { ; X64-NEXT: shrb $4, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i4: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: shrb $4, %al +; X86XOP-NEXT: # kill: def $al killed $al killed $eax +; X86XOP-NEXT: retl %b = call i4 @llvm.bitreverse.i4(i4 %a) ret i4 %b } @@ -404,6 +458,11 @@ define <2 x i16> @fold_v2i16() { ; X64: # %bb.0: ; X64-NEXT: movaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u> ; X64-NEXT: retq +; +; X86XOP-LABEL: fold_v2i16: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u> +; X86XOP-NEXT: retl %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>) ret <2 x i16> %b } @@ -418,6 +477,11 @@ define i24 @fold_i24() { ; X64: # %bb.0: ; X64-NEXT: movl $2048, %eax # imm = 0x800 ; X64-NEXT: retq +; +; X86XOP-LABEL: fold_i24: +; X86XOP: # %bb.0: +; X86XOP-NEXT: movl $2048, %eax # imm = 0x800 +; X86XOP-NEXT: retl %b = call i24 @llvm.bitreverse.i24(i24 4096) ret i24 %b } @@ -432,6 +496,11 @@ define i8 @fold_i8() { ; 
X64: # %bb.0: ; X64-NEXT: movb $-16, %al ; X64-NEXT: retq +; +; X86XOP-LABEL: fold_i8: +; X86XOP: # %bb.0: +; X86XOP-NEXT: movb $-16, %al +; X86XOP-NEXT: retl %b = call i8 @llvm.bitreverse.i8(i8 15) ret i8 %b } @@ -446,6 +515,11 @@ define i4 @fold_i4() { ; X64: # %bb.0: ; X64-NEXT: movb $1, %al ; X64-NEXT: retq +; +; X86XOP-LABEL: fold_i4: +; X86XOP: # %bb.0: +; X86XOP-NEXT: movb $1, %al +; X86XOP-NEXT: retl %b = call i4 @llvm.bitreverse.i4(i4 8) ret i4 %b } @@ -463,6 +537,11 @@ define i8 @identity_i8(i8 %a) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq +; +; X86XOP-LABEL: identity_i8: +; X86XOP: # %bb.0: +; X86XOP-NEXT: movb {{[0-9]+}}(%esp), %al +; X86XOP-NEXT: retl %b = call i8 @llvm.bitreverse.i8(i8 %a) %c = call i8 @llvm.bitreverse.i8(i8 %b) ret i8 %c @@ -478,6 +557,10 @@ define <2 x i16> @identity_v2i16(<2 x i16> %a) { ; X64-LABEL: identity_v2i16: ; X64: # %bb.0: ; X64-NEXT: retq +; +; X86XOP-LABEL: identity_v2i16: +; X86XOP: # %bb.0: +; X86XOP-NEXT: retl %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a) %c = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %b) ret <2 x i16> %c @@ -493,6 +576,10 @@ define i8 @undef_i8() { ; X64-LABEL: undef_i8: ; X64: # %bb.0: ; X64-NEXT: retq +; +; X86XOP-LABEL: undef_i8: +; X86XOP: # %bb.0: +; X86XOP-NEXT: retl %b = call i8 @llvm.bitreverse.i8(i8 undef) ret i8 %b } @@ -505,6 +592,10 @@ define <2 x i16> @undef_v2i16() { ; X64-LABEL: undef_v2i16: ; X64: # %bb.0: ; X64-NEXT: retq +; +; X86XOP-LABEL: undef_v2i16: +; X86XOP: # %bb.0: +; X86XOP-NEXT: retl %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef) ret <2 x i16> %b } @@ -1122,6 +1213,113 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: popq %r15 ; X64-NEXT: popq %rbp ; X64-NEXT: retq +; +; X86XOP-LABEL: large_promotion: +; X86XOP: # %bb.0: +; X86XOP-NEXT: pushl %ebp +; X86XOP-NEXT: pushl %ebx +; X86XOP-NEXT: pushl %edi +; X86XOP-NEXT: pushl %esi +; X86XOP-NEXT: subl $44, %esp +; X86XOP-NEXT: 
vmovdqa {{.*#+}} xmm0 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] +; X86XOP-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %eax +; X86XOP-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: shrdl $16, %eax, %ecx +; X86XOP-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %eax +; X86XOP-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: shrdl $16, %eax, %ecx +; X86XOP-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %eax +; X86XOP-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: shrdl $16, %eax, %ecx +; X86XOP-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %eax +; X86XOP-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: shrdl $16, %eax, %ecx +; X86XOP-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %eax +; X86XOP-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: shrdl $16, %eax, %ecx +; X86XOP-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ebp +; X86XOP-NEXT: shrdl $16, %ebp, %eax +; X86XOP-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %ebx +; X86XOP-NEXT: shrdl $16, %ebx, %ebp +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %esi +; X86XOP-NEXT: shrdl $16, %esi, %ebx +; X86XOP-NEXT: vpextrd $1, %xmm1, %edx +; X86XOP-NEXT: shrdl $16, %edx, %esi +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %edx +; X86XOP-NEXT: vpextrd $1, %xmm0, %edi +; X86XOP-NEXT: shrdl $16, %edi, %ecx +; X86XOP-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86XOP-NEXT: movl %ecx, 60(%eax) +; X86XOP-NEXT: movl %edx, 56(%eax) +; X86XOP-NEXT: movl %esi, 52(%eax) +; X86XOP-NEXT: movl %ebx, 48(%eax) +; X86XOP-NEXT: movl %ebp, 44(%eax) +; X86XOP-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 40(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 36(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 32(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 28(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 24(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 20(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 16(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 12(%eax) +; X86XOP-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 8(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 4(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, (%eax) +; X86XOP-NEXT: shrl $16, %edi +; X86XOP-NEXT: movw %di, 64(%eax) +; X86XOP-NEXT: addl $44, %esp +; X86XOP-NEXT: popl %esi +; X86XOP-NEXT: popl %edi +; X86XOP-NEXT: popl %ebx +; X86XOP-NEXT: popl %ebp +; X86XOP-NEXT: retl $4 %Z = call i528 @llvm.bitreverse.i528(i528 %A) ret i528 %Z }