Skip to content

Conversation

@stomfaig
Copy link
Contributor

Due to a previous PR (#171227), operations like _mm_ceil_sd compile to suboptimal assembly:

roundsd xmm1, xmm1, 10
blendpd xmm0, xmm1, 1

This PR introduces a rewrite pattern that mitigates this by fusing the two operations into a single instruction.

However, note that since ROUNDSSri_Int is defined via:

https://github.com/llvm/llvm-project/blob/26ff16663777fc995e8c6b46fa2433610dab4f64/llvm/lib/Target/X86/X86InstrSSE.td#L5692C1-L5694C63

in some cases we still end up with two instructions, for example (see also the diff):

roundsd $9, %xmm0, %xmm1
movapd %xmm1, %xmm0

I propose rewriting the definition of ROUNDSSri_Int (or adding another record, maybe ROUNDSS_rri_INT) to

defm ROUND  : sse41_fp_unop_s_int<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                                  v4f32, v2f64, X86RndScales, 0>, VVVV;

but I would like to discuss this first before implementing it.

@stomfaig stomfaig marked this pull request as draft December 12, 2025 17:54
@llvmbot
Copy link
Member

llvmbot commented Dec 12, 2025

@llvm/pr-subscribers-backend-x86

Author: Gergo Stomfai (stomfaig)

Changes

Due to a previous PR (#171227), operations like _mm_ceil_sd compile to suboptimal assembly:

roundsd xmm1, xmm1, 10
blendpd xmm0, xmm1, 1

This PR introduces a rewrite pattern that mitigates this by fusing the two operations into a single instruction.

However, note that since ROUNDSSri_Int is defined via:

https://github.com/llvm/llvm-project/blob/26ff16663777fc995e8c6b46fa2433610dab4f64/llvm/lib/Target/X86/X86InstrSSE.td#L5692C1-L5694C63

in some cases we still end up with two instructions, for example (see also the diff):

roundsd $9, %xmm0, %xmm1
movapd %xmm1, %xmm0

I propose rewriting the definition of ROUNDSSri_Int (or adding another record, maybe ROUNDSS_rri_INT) to

defm ROUND  : sse41_fp_unop_s_int<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                                  v4f32, v2f64, X86RndScales, 0>, VVVV;

but I would like to discuss this first before implementing it.


Full diff: https://github.com/llvm/llvm-project/pull/172056.diff

2 Files Affected:

  • (modified) llvm/lib/Target/X86/X86InstrSSE.td (+25)
  • (modified) llvm/test/CodeGen/X86/vec_floor.ll (+12-16)
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index e4aaa1e1b594a..6c6e8386e4b58 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5707,6 +5707,31 @@ let Predicates = [UseSSE41, OptForSize] in {
             (ROUNDSDmi addr:$src1, timm:$src2)>;
 }
 
+multiclass scalar_unary_math_patterns_with_immediate<
+    SDPatternOperator OpNode, string OpcPrefix, SDNode Move, ValueType VT> {
+  let Predicates = [UseSSE41] in {
+    def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode
+                                    (extractelt VT:$src, (i64 0)),
+                                    i32:$imm)))),
+              (!cast<Instruction>(OpcPrefix#ri_Int) VT:$dst, VT:$src,
+                  i32:$imm)>;
+  }
+
+  // Repeat for AVX versions of the instructions.
+  let Predicates = [UseAVX] in {
+    def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode
+                                    (extractelt VT:$src, (i64 0)),
+                                    i32:$imm)))),
+              (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src,
+                  i32:$imm)>;
+  }
+}
+
+defm : scalar_unary_math_patterns_with_immediate<X86any_VRndScale, "ROUNDSS",
+                                                 X86Movss, v4f32>;
+defm : scalar_unary_math_patterns_with_immediate<X86any_VRndScale, "ROUNDSD",
+                                                 X86Movsd, v2f64>;
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Packed Bit Test
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index 7f4ed3394d10d..ffe493cdac1a8 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -821,14 +821,13 @@ define <4 x float> @const_trunc_v4f32() {
 define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
 ; SSE41-LABEL: floor_ss:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    roundss $9, %xmm0, %xmm0
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    roundss $9, %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: floor_ss:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vroundss $9, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_ss:
@@ -846,14 +845,13 @@ declare float @llvm.floor.f32(float %s)
 define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
 ; SSE41-LABEL: floor_sd:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    roundsd $9, %xmm0, %xmm0
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    roundsd $9, %xmm0, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: floor_sd:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vroundsd $9, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_sd:
@@ -1811,14 +1809,13 @@ define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) noun
 define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
 ; SSE41-LABEL: ceil_ss:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    roundss $10, %xmm0, %xmm0
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    roundss $10, %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: ceil_ss:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vroundss $10, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_ss:
@@ -1836,14 +1833,13 @@ declare float @llvm.ceil.f32(float %s)
 define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
 ; SSE41-LABEL: ceil_sd:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    roundsd $10, %xmm0, %xmm0
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    roundsd $10, %xmm0, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: ceil_sd:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vroundsd $10, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_sd:

@stomfaig
Copy link
Contributor Author

cc: @phoebewang @RKSimon

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants