-
Notifications
You must be signed in to change notification settings - Fork 10.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use 256 bit register when AVX2 or higher is available. #91721
base: main
Are you sure you want to change the base?
Use 256 bit register when AVX2 or higher is available. #91721
Conversation
@llvm/pr-subscribers-backend-x86 — Author: None (shamithoke). Changes: Based on my internal tests, using a 256-bit register is faster (when available). Full diff: https://github.com/llvm/llvm-project/pull/91721.diff — 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a811ce43422ec..bbacc146abe98 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31363,10 +31363,13 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
assert(
(VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
"Only tested for i8/i16/i32/i64");
- MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
+ unsigned int VecLen = Subtarget.hasAVX2() ? 256 : 128;
+ MVT CharVecVT = Subtarget.hasAVX2() ? MVT::v32i8 : MVT::v16i8;
+
+ MVT VecVT = MVT::getVectorVT(VT, VecLen / VT.getSizeInBits());
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
- Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
- DAG.getBitcast(MVT::v16i8, Res));
+ Res = DAG.getNode(ISD::BITREVERSE, DL, CharVecVT,
+ DAG.getBitcast(CharVecVT, Res));
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 4f2654843728f..207878408e57e 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -173,9 +173,10 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; GFNI-LABEL: test_bitreverse_i64:
; GFNI: # %bb.0:
; GFNI-NEXT: vmovq %rdi, %xmm0
-; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; GFNI-NEXT: vmovq %xmm0, %rax
; GFNI-NEXT: bswapq %rax
+; GFNI-NEXT: vzeroupper
; GFNI-NEXT: retq
%b = call i64 @llvm.bitreverse.i64(i64 %a)
ret i64 %b
@@ -238,9 +239,10 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; GFNI-LABEL: test_bitreverse_i32:
; GFNI: # %bb.0:
; GFNI-NEXT: vmovd %edi, %xmm0
-; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; GFNI-NEXT: vmovd %xmm0, %eax
; GFNI-NEXT: bswapl %eax
+; GFNI-NEXT: vzeroupper
; GFNI-NEXT: retq
%b = call i32 @llvm.bitreverse.i32(i32 %a)
ret i32 %b
@@ -306,10 +308,11 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind {
; GFNI-LABEL: test_bitreverse_i24:
; GFNI: # %bb.0:
; GFNI-NEXT: vmovd %edi, %xmm0
-; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; GFNI-NEXT: vmovd %xmm0, %eax
; GFNI-NEXT: bswapl %eax
; GFNI-NEXT: shrl $8, %eax
+; GFNI-NEXT: vzeroupper
; GFNI-NEXT: retq
%b = call i24 @llvm.bitreverse.i24(i24 %a)
ret i24 %b
@@ -375,10 +378,11 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; GFNI-LABEL: test_bitreverse_i16:
; GFNI: # %bb.0:
; GFNI-NEXT: vmovd %edi, %xmm0
-; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; GFNI-NEXT: vmovd %xmm0, %eax
; GFNI-NEXT: rolw $8, %ax
; GFNI-NEXT: # kill: def $ax killed $ax killed $eax
+; GFNI-NEXT: vzeroupper
; GFNI-NEXT: retq
%b = call i16 @llvm.bitreverse.i16(i16 %a)
ret i16 %b
@@ -433,9 +437,10 @@ define i8 @test_bitreverse_i8(i8 %a) {
; GFNI-LABEL: test_bitreverse_i8:
; GFNI: # %bb.0:
; GFNI-NEXT: vmovd %edi, %xmm0
-; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; GFNI-NEXT: vmovd %xmm0, %eax
; GFNI-NEXT: # kill: def $al killed $al killed $eax
+; GFNI-NEXT: vzeroupper
; GFNI-NEXT: retq
%b = call i8 @llvm.bitreverse.i8(i8 %a)
ret i8 %b
@@ -492,10 +497,11 @@ define i4 @test_bitreverse_i4(i4 %a) {
; GFNI-LABEL: test_bitreverse_i4:
; GFNI: # %bb.0:
; GFNI-NEXT: vmovd %edi, %xmm0
-; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; GFNI-NEXT: vmovd %xmm0, %eax
; GFNI-NEXT: shrb $4, %al
; GFNI-NEXT: # kill: def $al killed $al killed $eax
+; GFNI-NEXT: vzeroupper
; GFNI-NEXT: retq
%b = call i4 @llvm.bitreverse.i4(i4 %a)
ret i4 %b
@@ -1340,48 +1346,48 @@ define i528 @large_promotion(i528 %A) nounwind {
; GFNI-NEXT: pushq %r14
; GFNI-NEXT: pushq %rbx
; GFNI-NEXT: movq %rdi, %rax
-; GFNI-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9241421688590303745,9241421688590303745]
; GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNI-NEXT: vgf2p8affineqb $0, %ymm0, %ymm1, %ymm1
; GFNI-NEXT: vmovq %xmm1, %r10
; GFNI-NEXT: bswapq %r10
; GFNI-NEXT: vmovq %r9, %xmm1
-; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %ymm0, %ymm1, %ymm1
; GFNI-NEXT: vmovq %xmm1, %rdi
; GFNI-NEXT: bswapq %rdi
; GFNI-NEXT: vmovq %r8, %xmm1
-; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %ymm0, %ymm1, %ymm1
; GFNI-NEXT: vmovq %xmm1, %r8
; GFNI-NEXT: bswapq %r8
; GFNI-NEXT: movq %r8, %r9
; GFNI-NEXT: shldq $16, %rdi, %r9
; GFNI-NEXT: shldq $16, %r10, %rdi
; GFNI-NEXT: vmovq %rcx, %xmm1
-; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %ymm0, %ymm1, %ymm1
; GFNI-NEXT: vmovq %xmm1, %rcx
; GFNI-NEXT: bswapq %rcx
; GFNI-NEXT: shrdq $48, %rcx, %r8
; GFNI-NEXT: vmovq %rdx, %xmm1
-; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %ymm0, %ymm1, %ymm1
; GFNI-NEXT: vmovq %xmm1, %rdx
; GFNI-NEXT: bswapq %rdx
; GFNI-NEXT: shrdq $48, %rdx, %rcx
; GFNI-NEXT: vmovq %rsi, %xmm1
-; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %ymm0, %ymm1, %ymm1
; GFNI-NEXT: vmovq %xmm1, %rsi
; GFNI-NEXT: bswapq %rsi
; GFNI-NEXT: shrdq $48, %rsi, %rdx
; GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %ymm0, %ymm1, %ymm1
; GFNI-NEXT: vmovq %xmm1, %r11
; GFNI-NEXT: bswapq %r11
; GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
+; GFNI-NEXT: vgf2p8affineqb $0, %ymm0, %ymm1, %ymm1
; GFNI-NEXT: vmovq %xmm1, %rbx
; GFNI-NEXT: bswapq %rbx
; GFNI-NEXT: shrdq $48, %rbx, %r11
; GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm0
+; GFNI-NEXT: vgf2p8affineqb $0, %ymm0, %ymm1, %ymm0
; GFNI-NEXT: vmovq %xmm0, %r14
; GFNI-NEXT: bswapq %r14
; GFNI-NEXT: shrdq $48, %r14, %rbx
@@ -1398,6 +1404,7 @@ define i528 @large_promotion(i528 %A) nounwind {
; GFNI-NEXT: movw %si, 64(%rax)
; GFNI-NEXT: popq %rbx
; GFNI-NEXT: popq %r14
+; GFNI-NEXT: vzeroupper
; GFNI-NEXT: retq
%Z = call i528 @llvm.bitreverse.i528(i528 %A)
ret i528 %Z
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 90cc3d5fdde82..e950955dff80a 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -67,13 +67,33 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNISSE-NEXT: # kill: def $al killed $al killed $eax
; GFNISSE-NEXT: retq
;
-; GFNIAVX-LABEL: test_bitreverse_i8:
-; GFNIAVX: # %bb.0:
-; GFNIAVX-NEXT: vmovd %edi, %xmm0
-; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX-NEXT: vmovd %xmm0, %eax
-; GFNIAVX-NEXT: # kill: def $al killed $al killed $eax
-; GFNIAVX-NEXT: retq
+; GFNIAVX1-LABEL: test_bitreverse_i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vmovd %edi, %xmm0
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovd %xmm0, %eax
+; GFNIAVX1-NEXT: # kill: def $al killed $al killed $eax
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vmovd %edi, %xmm0
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vmovd %xmm0, %eax
+; GFNIAVX2-NEXT: # kill: def $al killed $al killed $eax
+; GFNIAVX2-NEXT: vzeroupper
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: test_bitreverse_i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vmovd %edi, %xmm0
+; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vmovd %xmm0, %eax
+; GFNIAVX512-NEXT: # kill: def $al killed $al killed $eax
+; GFNIAVX512-NEXT: vzeroupper
+; GFNIAVX512-NEXT: retq
%b = call i8 @llvm.bitreverse.i8(i8 %a)
ret i8 %b
}
@@ -142,14 +162,36 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; GFNISSE-NEXT: # kill: def $ax killed $ax killed $eax
; GFNISSE-NEXT: retq
;
-; GFNIAVX-LABEL: test_bitreverse_i16:
-; GFNIAVX: # %bb.0:
-; GFNIAVX-NEXT: vmovd %edi, %xmm0
-; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX-NEXT: vmovd %xmm0, %eax
-; GFNIAVX-NEXT: rolw $8, %ax
-; GFNIAVX-NEXT: # kill: def $ax killed $ax killed $eax
-; GFNIAVX-NEXT: retq
+; GFNIAVX1-LABEL: test_bitreverse_i16:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vmovd %edi, %xmm0
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovd %xmm0, %eax
+; GFNIAVX1-NEXT: rolw $8, %ax
+; GFNIAVX1-NEXT: # kill: def $ax killed $ax killed $eax
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i16:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vmovd %edi, %xmm0
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vmovd %xmm0, %eax
+; GFNIAVX2-NEXT: rolw $8, %ax
+; GFNIAVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; GFNIAVX2-NEXT: vzeroupper
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: test_bitreverse_i16:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vmovd %edi, %xmm0
+; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vmovd %xmm0, %eax
+; GFNIAVX512-NEXT: rolw $8, %ax
+; GFNIAVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; GFNIAVX512-NEXT: vzeroupper
+; GFNIAVX512-NEXT: retq
%b = call i16 @llvm.bitreverse.i16(i16 %a)
ret i16 %b
}
@@ -214,13 +256,33 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; GFNISSE-NEXT: bswapl %eax
; GFNISSE-NEXT: retq
;
-; GFNIAVX-LABEL: test_bitreverse_i32:
-; GFNIAVX: # %bb.0:
-; GFNIAVX-NEXT: vmovd %edi, %xmm0
-; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX-NEXT: vmovd %xmm0, %eax
-; GFNIAVX-NEXT: bswapl %eax
-; GFNIAVX-NEXT: retq
+; GFNIAVX1-LABEL: test_bitreverse_i32:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vmovd %edi, %xmm0
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovd %xmm0, %eax
+; GFNIAVX1-NEXT: bswapl %eax
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i32:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vmovd %edi, %xmm0
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vmovd %xmm0, %eax
+; GFNIAVX2-NEXT: bswapl %eax
+; GFNIAVX2-NEXT: vzeroupper
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: test_bitreverse_i32:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vmovd %edi, %xmm0
+; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vmovd %xmm0, %eax
+; GFNIAVX512-NEXT: bswapl %eax
+; GFNIAVX512-NEXT: vzeroupper
+; GFNIAVX512-NEXT: retq
%b = call i32 @llvm.bitreverse.i32(i32 %a)
ret i32 %b
}
@@ -289,13 +351,33 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; GFNISSE-NEXT: bswapq %rax
; GFNISSE-NEXT: retq
;
-; GFNIAVX-LABEL: test_bitreverse_i64:
-; GFNIAVX: # %bb.0:
-; GFNIAVX-NEXT: vmovq %rdi, %xmm0
-; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX-NEXT: vmovq %xmm0, %rax
-; GFNIAVX-NEXT: bswapq %rax
-; GFNIAVX-NEXT: retq
+; GFNIAVX1-LABEL: test_bitreverse_i64:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vmovq %rdi, %xmm0
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovq %xmm0, %rax
+; GFNIAVX1-NEXT: bswapq %rax
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: test_bitreverse_i64:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vmovq %rdi, %xmm0
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vmovq %xmm0, %rax
+; GFNIAVX2-NEXT: bswapq %rax
+; GFNIAVX2-NEXT: vzeroupper
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: test_bitreverse_i64:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vmovq %rdi, %xmm0
+; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vmovq %xmm0, %rax
+; GFNIAVX512-NEXT: bswapq %rax
+; GFNIAVX512-NEXT: vzeroupper
+; GFNIAVX512-NEXT: retq
%b = call i64 @llvm.bitreverse.i64(i64 %a)
ret i64 %b
}
|
What CPUs was this tested on? |
Icelake |
I can't see anything in Agner/instlatx64/uops.info to match this, and it'd be very unusual for a 256-bit instruction to be faster than the 128-bit variant. Do you have a benchmark that I can run on my rocketlake machine? @phoebewang Do you have any ideas what could be going on? |
I have a Java test (compiled through an internal JIT compiler), and I measured its performance. I observe roughly a 5% improvement when using AVX2.
I don't know what the general benchmark used by the community is. I can try it with our internal benchmarks.
The result shows only two benchmarks are affected, which have both 0.15% improvement, hard to tell if they are noise. |
Oh! That is a very minimal change. In my case, the improvement is in the 3–5% range.
@RKSimon, @phoebewang |
I'd really like to see the codegen diff for a benchmark that shows a real perf differences - my gut feeling is something else is being combined as a sideeffect and its nothing to do with the GFNI instruction by itself. |
Based on my internal tests, using a 256-bit register is faster (when available).