Enable custom lowering for fabs_v16f16 with AVX2 #72914
Conversation
@llvm/pr-subscribers-backend-x86

Author: David Li (david-xl)

Changes

This is the part-2 change to improve codegen for vec_fabs. In this patch, fabs for v16f16 and v32f16 is improved. There will be at least two follow-up patches after this one.
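For context, a minimal sketch of the kind of test case this lowering targets (mirroring fabs_v16f16 in llvm/test/CodeGen/X86/vec_fabs.ll from the diff below): with the custom FABS lowering enabled under AVX2, the llvm.fabs.v16f16 call is expected to compile down to a broadcast 0x7FFF mask plus a single vpand, rather than the long chain of half/float conversion libcalls removed from the check lines.

; Sketch of the pattern this patch improves (function name matches the existing test).
define <16 x half> @fabs_v16f16(ptr %p) {
  %v = load <16 x half>, ptr %p                              ; load 16 half values
  %abs = call <16 x half> @llvm.fabs.v16f16(<16 x half> %v)  ; clear each sign bit
  ret <16 x half> %abs
}
declare <16 x half> @llvm.fabs.v16f16(<16 x half>)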
Patch is 108.50 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/72914.diff

2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index deeab311320f0a2..5ec9b18f458a5bf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1596,6 +1596,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STORE, VT, Custom);
}
setF16Action(MVT::v16f16, Expand);
+ if (Subtarget.hasAVX2())
+ setOperationAction(ISD::FABS, MVT::v16f16, Custom);
setOperationAction(ISD::FADD, MVT::v16f16, Expand);
setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
@@ -1695,6 +1697,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
}
// This block controls legalization for 512-bit operations with 8/16/32/64 bit
diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll
index f691cb76bc684e6..ececfce210f563d 100644
--- a/llvm/test/CodeGen/X86/vec_fabs.ll
+++ b/llvm/test/CodeGen/X86/vec_fabs.ll
@@ -515,564 +515,17 @@ define <16 x half> @fabs_v16f16(ptr %p) {
;
; X86-AVX2-LABEL: fabs_v16f16:
; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: pushl %esi
-; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT: subl $372, %esp # imm = 0x174
-; X86-AVX2-NEXT: .cfi_def_cfa_offset 380
-; X86-AVX2-NEXT: .cfi_offset %esi, -8
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-AVX2-NEXT: vmovdqa (%esi), %xmm0
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovaps 16(%esi), %xmm1
-; X86-AVX2-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X86-AVX2-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT: vpinsrw $0, 4(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT: vpinsrw $0, 20(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT: vpinsrw $0, 8(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT: vpinsrw $0, 24(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT: vpinsrw $0, 12(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT: vpinsrw $0, 28(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __extendhfsf2
-; X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vpand {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT: vmovss %xmm1, (%esp)
-; X86-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; X86-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT: vzeroupper
-; X86-AVX2-NEXT: calll __truncsfhf2
-; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X86-AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X86-AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X86-AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X86-AVX2-NEXT: addl $372, %esp # imm = 0x174
-; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT: popl %esi
-; X86-AVX2-NEXT: .cfi_def_cfa_offset 4
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X86-AVX2-NEXT: vpand (%eax), %ymm0, %ymm0
; X86-AVX2-NEXT: retl
;
-; X86-AVX512VL-LABEL: fabs_v16f16:
-; X86-AVX512VL: # %bb.0:
-; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512VL-NEXT: movzwl 28(%eax), %ecx
-; X86-AVX512VL-NEXT: vmovd %ecx, %xmm0
-; X86-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm1
-; X86-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
-; X86-AVX512VL-NEXT: vpand %xmm0, %xmm1, %xmm1
-; X86-AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; X86-AVX512VL-NEXT: vmovd %xmm1, %ecx
-; X86-AVX512VL-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm3
-; X86-AVX512VL-NEXT: vmovdqa (%eax), %xmm1
-; X86-AVX512VL-NEXT: vmovdqa 16(%eax), %xmm2
-; X86-AVX512VL-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT: vpextrw $0, %xmm4, %ecx
-; X86-AVX512VL-NEXT: movzwl %cx, %ecx
-; X86-AVX512VL-NEXT: vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT: vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT: vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-AVX512VL-NEXT: movzwl 12(%eax), %ecx
-; X86-AVX512VL-NEXT: vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT: vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT: vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT: vpextrw $0, %xmm5, %ecx
-; X86-AVX512VL-NEXT: movzwl %cx, %ecx
-; X86-AVX512VL-NEXT: vmovd %ecx, %xmm5
-; X86-AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT: vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT: vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT: vmovd %xmm5, %ecx
-; X86-AVX512VL-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; X86-AVX512VL-NEXT: movzwl 24(%eax), %ecx
-; X86-AVX512VL-NEXT: vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT: vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT: vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT: vpextrw $0, %xmm5, %ecx
-; X86-AVX512VL-NEXT: movzwl %cx, %ecx
-; X86-AVX512VL-NEXT: vmovd %ecx, %xmm5
-; X86-AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT: vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT: vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-N...
[truncated]
LGTM