Skip to content

Conversation

@RKSimon
Copy link
Collaborator

@RKSimon RKSimon commented Dec 1, 2025

Similar to fdiv, we should try to concatenate these high-latency FSQRT instructions together, so that several narrow vector square roots become a single wider one.

… together

Similar to fdiv, we should be trying to concat these high latency instructions together
@llvmbot
Copy link
Member

llvmbot commented Dec 1, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Similar to fdiv, we should try to concatenate these high-latency FSQRT instructions together, so that several narrow vector square roots become a single wider one.


Full diff: https://github.com/llvm/llvm-project/pull/170113.diff

2 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+6)
  • (modified) llvm/test/CodeGen/X86/combine-fsqrt.ll (+11-12)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1b0bf6823e390..9e20807ddf304 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59458,6 +59458,12 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
                            ConcatSubOperand(VT, Ops, 1));
       }
       break;
+    case ISD::FSQRT:
+      if (!IsSplat && (VT.is256BitVector() ||
+                       (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+        return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
+      }
+      break;
     case X86ISD::HADD:
     case X86ISD::HSUB:
     case X86ISD::FHADD:
diff --git a/llvm/test/CodeGen/X86/combine-fsqrt.ll b/llvm/test/CodeGen/X86/combine-fsqrt.ll
index ddd7d3ac24315..da1dcec231f19 100644
--- a/llvm/test/CodeGen/X86/combine-fsqrt.ll
+++ b/llvm/test/CodeGen/X86/combine-fsqrt.ll
@@ -14,9 +14,9 @@ define <8 x float> @concat_sqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
 ;
 ; AVX-LABEL: concat_sqrt_v8f32_v4f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtps %xmm0, %xmm0
-; AVX-NEXT:    vsqrtps %xmm1, %xmm1
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vsqrtps %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
   %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1)
@@ -35,23 +35,22 @@ define <16 x float> @concat_sqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1,
 ;
 ; AVX1OR2-LABEL: concat_sqrt_v16f32_v4f32:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vsqrtps %xmm0, %xmm0
-; AVX1OR2-NEXT:    vsqrtps %xmm1, %xmm1
-; AVX1OR2-NEXT:    vsqrtps %xmm2, %xmm2
-; AVX1OR2-NEXT:    vsqrtps %xmm3, %xmm3
+; AVX1OR2-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1OR2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vsqrtps %ymm0, %ymm0
 ; AVX1OR2-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT:    vsqrtps %ymm1, %ymm1
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512-LABEL: concat_sqrt_v16f32_v4f32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vsqrtps %xmm0, %xmm0
-; AVX512-NEXT:    vsqrtps %xmm1, %xmm1
-; AVX512-NEXT:    vsqrtps %xmm2, %xmm2
-; AVX512-NEXT:    vsqrtps %xmm3, %xmm3
+; AVX512-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX512-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
   %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1)
@@ -80,9 +79,9 @@ define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1)
 ;
 ; AVX512-LABEL: concat_sqrt_v16f32_v8f32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vsqrtps %ymm0, %ymm0
-; AVX512-NEXT:    vsqrtps %ymm1, %ymm1
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %v0 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0)
   %v1 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a1)

@RKSimon RKSimon enabled auto-merge (squash) December 1, 2025 12:30
@RKSimon RKSimon merged commit 05ad840 into llvm:main Dec 1, 2025
11 of 12 checks passed
@RKSimon RKSimon deleted the x86-concat-sqrt branch December 1, 2025 12:52
aahrun pushed a commit to aahrun/llvm-project that referenced this pull request Dec 1, 2025
… together (llvm#170113)

Similar to fdiv, we should be trying to concat these high latency instructions together
augusto2112 pushed a commit to augusto2112/llvm-project that referenced this pull request Dec 3, 2025
… together (llvm#170113)

Similar to fdiv, we should be trying to concat these high latency instructions together
kcloudy0717 pushed a commit to kcloudy0717/llvm-project that referenced this pull request Dec 4, 2025
… together (llvm#170113)

Similar to fdiv, we should be trying to concat these high latency instructions together
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants