[AArch64] Add scal_to_vec patterns for SIMD convert intrinsics #172837
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: None (Lukacma)

Changes

This patch adds patterns for lowering FCVT intrinsics followed by a scalar_to_vector node into SIMD FCVT instructions. This prevents extra moves from being generated when the GPR form of the conversion would otherwise be used.

Patch is 77.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/172837.diff

9 Files Affected:
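For illustration, here is the shape of one of the new tests added in arm64-cvt-simd-fptoi.ll (shown in full further down). When the FPR-destination form of the conversion is available, the new patterns select it directly for the scalar_to_vector use instead of converting in a GPR and moving the result back:

define <1 x i64> @fcvtzs_scalar_to_vector_s(float %a) {
  %val = fptosi float %a to i64                          ; fcvtzs
  %vec = insertelement <1 x i64> poison, i64 %val, i32 0 ; scalar_to_vector
  ret <1 x i64> %vec
}

; GPR round trip (CHECK-NOFPRCVT):    ; Direct SIMD form (CHECK):
;   fcvtzs x8, s0                     ;   fcvtzs d0, s0
;   fmov   d0, x8                     ;   ret
;   ret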
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c22929f379dfc..447fd9ef66343 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6563,12 +6563,19 @@ multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> {
(!cast<Instruction>(INST # DHr) FPR16:$Rn)>;
def : Pat<(f64 (bitconvert (i64 (OpN (f32 FPR32:$Rn))))),
(!cast<Instruction>(INST # DSr) FPR32:$Rn)>;
+
+ def : Pat<(v1i64 (scalar_to_vector (i64 (OpN (f16 FPR16:$Rn))))),
+ (!cast<Instruction>(INST # DHr) FPR16:$Rn)>;
+ def : Pat<(v1i64 (scalar_to_vector (i64 (OpN (f32 FPR32:$Rn))))),
+ (!cast<Instruction>(INST # DSr) FPR32:$Rn)>;
}
def : Pat<(f32 (bitconvert (i32 (OpN (f32 FPR32:$Rn))))),
(!cast<Instruction>(INST # v1i32) FPR32:$Rn)>;
def : Pat<(f64 (bitconvert (i64 (OpN (f64 FPR64:$Rn))))),
(!cast<Instruction>(INST # v1i64) FPR64:$Rn)>;
-
+
+ def : Pat<(v1i64 (scalar_to_vector (i64 (OpN (f64 FPR64:$Rn))))),
+ (!cast<Instruction>(INST # v1i64) FPR64:$Rn)>;
}
defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtas, "FCVTAS">;
defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtau, "FCVTAU">;
@@ -6611,12 +6618,20 @@ multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
(!cast<Instruction>(INST # DSr) $Rn)>;
def : Pat<(f32 (bitconvert (i32 (round f64:$Rn)))),
(!cast<Instruction>(INST # SDr) $Rn)>;
+
+ def : Pat<(v1i64 (scalar_to_vector (i64 (round f16:$Rn)))),
+ (!cast<Instruction>(INST # DHr) $Rn)>;
+ def : Pat<(v1i64 (scalar_to_vector (i64 (round f32:$Rn)))),
+ (!cast<Instruction>(INST # DSr) $Rn)>;
}
def : Pat<(f32 (bitconvert (i32 (round f32:$Rn)))),
(!cast<Instruction>(INST # v1i32) $Rn)>;
def : Pat<(f64 (bitconvert (i64 (round f64:$Rn)))),
(!cast<Instruction>(INST # v1i64) $Rn)>;
+ def : Pat<(v1i64 (scalar_to_vector (i64 (round f64:$Rn)))),
+ (!cast<Instruction>(INST # v1i64) $Rn)>;
+
let Predicates = [HasFullFP16] in {
def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
(!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll
index a729772f2897a..ebaca00d2cdb9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll
@@ -15,6 +15,10 @@
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f64_simd
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i64_f64_simd
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f32_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtzs_scalar_to_vector_h_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtzs_scalar_to_vector_s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtzu_scalar_to_vector_h_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtzu_scalar_to_vector_s_strict
;
; FPTOI
@@ -1941,3 +1945,169 @@ define double @fcvtzu_dd_simd(double %a) {
%bc = bitcast i64 %i to double
ret double %bc
}
+
+;
+; FPTOI scalar_to_vector
+;
+
+define <1 x i64> @fcvtzs_scalar_to_vector_h(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_scalar_to_vector_h:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_scalar_to_vector_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %val = fptosi half %a to i64
+ %vec = insertelement <1 x i64> poison, i64 %val, i32 0
+ ret <1 x i64> %vec
+}
+
+define <1 x i64> @fcvtzs_scalar_to_vector_s(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_scalar_to_vector_s:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_scalar_to_vector_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %val = fptosi float %a to i64
+ %vec = insertelement <1 x i64> poison, i64 %val, i32 0
+ ret <1 x i64> %vec
+}
+
+define <1 x i64> @fcvtzs_scalar_to_vector_d(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_scalar_to_vector_d:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_scalar_to_vector_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %val = fptosi double %a to i64
+ %vec = insertelement <1 x i64> poison, i64 %val, i32 0
+ ret <1 x i64> %vec
+}
+
+define <1 x i64> @fcvtzu_scalar_to_vector_h(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_scalar_to_vector_h:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_scalar_to_vector_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %val = fptoui half %a to i64
+ %vec = insertelement <1 x i64> poison, i64 %val, i32 0
+ ret <1 x i64> %vec
+}
+
+define <1 x i64> @fcvtzu_scalar_to_vector_s(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_scalar_to_vector_s:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_scalar_to_vector_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %val = fptoui float %a to i64
+ %vec = insertelement <1 x i64> poison, i64 %val, i32 0
+ ret <1 x i64> %vec
+}
+
+define <1 x i64> @fcvtzu_scalar_to_vector_d(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_scalar_to_vector_d:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_scalar_to_vector_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %val = fptoui double %a to i64
+ %vec = insertelement <1 x i64> poison, i64 %val, i32 0
+ ret <1 x i64> %vec
+}
+
+;
+; FPTOI scalar_to_vector strictfp
+;
+
+define <1 x i64> @fcvtzs_scalar_to_vector_h_strict(half %x) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_scalar_to_vector_h_strict:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_scalar_to_vector_h_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f16(half %x, metadata !"fpexcept.strict")
+ %vec = insertelement <1 x i64> poison, i64 %val, i32 0
+ ret <1 x i64> %vec
+}
+
+define <1 x i64> @fcvtzs_scalar_to_vector_s_strict(float %x) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_scalar_to_vector_s_strict:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_scalar_to_vector_s_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f32(float %x, metadata !"fpexcept.strict")
+ %vec = insertelement <1 x i64> poison, i64 %val, i32 0
+ ret <1 x i64> %vec
+}
+
+define <1 x i64> @fcvtzu_scalar_to_vector_h_strict(half %x) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_scalar_to_vector_h_strict:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_scalar_to_vector_h_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f16(half %x, metadata !"fpexcept.strict")
+ %vec = insertelement <1 x i64> poison, i64 %val, i32 0
+ ret <1 x i64> %vec
+}
+
+define <1 x i64> @fcvtzu_scalar_to_vector_s_strict(float %x) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_scalar_to_vector_s_strict:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_scalar_to_vector_s_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f32(float %x, metadata !"fpexcept.strict")
+ %vec = insertelement <1 x i64> poison, i64 %val, i32 0
+ ret <1 x i64> %vec
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.s b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.s
new file mode 100644
index 0000000000000..0850b306e8c79
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.s
@@ -0,0 +1,1515 @@
+ .file "arm64-cvt-simd-fptoi.ll"
+ .text
+ .globl test_fptosi_f16_i32_simd // -- Begin function test_fptosi_f16_i32_simd
+ .p2align 2
+ .type test_fptosi_f16_i32_simd,@function
+test_fptosi_f16_i32_simd: // @test_fptosi_f16_i32_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs s0, h0
+ ret
+.Lfunc_end0:
+ .size test_fptosi_f16_i32_simd, .Lfunc_end0-test_fptosi_f16_i32_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptosi_f16_i64_simd // -- Begin function test_fptosi_f16_i64_simd
+ .p2align 2
+ .type test_fptosi_f16_i64_simd,@function
+test_fptosi_f16_i64_simd: // @test_fptosi_f16_i64_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs d0, h0
+ ret
+.Lfunc_end1:
+ .size test_fptosi_f16_i64_simd, .Lfunc_end1-test_fptosi_f16_i64_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptosi_f64_i32_simd // -- Begin function test_fptosi_f64_i32_simd
+ .p2align 2
+ .type test_fptosi_f64_i32_simd,@function
+test_fptosi_f64_i32_simd: // @test_fptosi_f64_i32_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs s0, d0
+ ret
+.Lfunc_end2:
+ .size test_fptosi_f64_i32_simd, .Lfunc_end2-test_fptosi_f64_i32_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptosi_f32_i64_simd // -- Begin function test_fptosi_f32_i64_simd
+ .p2align 2
+ .type test_fptosi_f32_i64_simd,@function
+test_fptosi_f32_i64_simd: // @test_fptosi_f32_i64_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs d0, s0
+ ret
+.Lfunc_end3:
+ .size test_fptosi_f32_i64_simd, .Lfunc_end3-test_fptosi_f32_i64_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptosi_f64_i64_simd // -- Begin function test_fptosi_f64_i64_simd
+ .p2align 2
+ .type test_fptosi_f64_i64_simd,@function
+test_fptosi_f64_i64_simd: // @test_fptosi_f64_i64_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs d0, d0
+ ret
+.Lfunc_end4:
+ .size test_fptosi_f64_i64_simd, .Lfunc_end4-test_fptosi_f64_i64_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptosi_f32_i32_simd // -- Begin function test_fptosi_f32_i32_simd
+ .p2align 2
+ .type test_fptosi_f32_i32_simd,@function
+test_fptosi_f32_i32_simd: // @test_fptosi_f32_i32_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs s0, s0
+ ret
+.Lfunc_end5:
+ .size test_fptosi_f32_i32_simd, .Lfunc_end5-test_fptosi_f32_i32_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptoui_f16_i32_simd // -- Begin function test_fptoui_f16_i32_simd
+ .p2align 2
+ .type test_fptoui_f16_i32_simd,@function
+test_fptoui_f16_i32_simd: // @test_fptoui_f16_i32_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu s0, h0
+ ret
+.Lfunc_end6:
+ .size test_fptoui_f16_i32_simd, .Lfunc_end6-test_fptoui_f16_i32_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptoui_f16_i64_simd // -- Begin function test_fptoui_f16_i64_simd
+ .p2align 2
+ .type test_fptoui_f16_i64_simd,@function
+test_fptoui_f16_i64_simd: // @test_fptoui_f16_i64_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu d0, h0
+ ret
+.Lfunc_end7:
+ .size test_fptoui_f16_i64_simd, .Lfunc_end7-test_fptoui_f16_i64_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptoui_f64_i32_simd // -- Begin function test_fptoui_f64_i32_simd
+ .p2align 2
+ .type test_fptoui_f64_i32_simd,@function
+test_fptoui_f64_i32_simd: // @test_fptoui_f64_i32_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu s0, d0
+ ret
+.Lfunc_end8:
+ .size test_fptoui_f64_i32_simd, .Lfunc_end8-test_fptoui_f64_i32_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptoui_f32_i64_simd // -- Begin function test_fptoui_f32_i64_simd
+ .p2align 2
+ .type test_fptoui_f32_i64_simd,@function
+test_fptoui_f32_i64_simd: // @test_fptoui_f32_i64_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu d0, s0
+ ret
+.Lfunc_end9:
+ .size test_fptoui_f32_i64_simd, .Lfunc_end9-test_fptoui_f32_i64_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptoui_f64_i64_simd // -- Begin function test_fptoui_f64_i64_simd
+ .p2align 2
+ .type test_fptoui_f64_i64_simd,@function
+test_fptoui_f64_i64_simd: // @test_fptoui_f64_i64_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu d0, d0
+ ret
+.Lfunc_end10:
+ .size test_fptoui_f64_i64_simd, .Lfunc_end10-test_fptoui_f64_i64_simd
+ .cfi_endproc
+ // -- End function
+ .globl test_fptoui_f32_i32_simd // -- Begin function test_fptoui_f32_i32_simd
+ .p2align 2
+ .type test_fptoui_f32_i32_simd,@function
+test_fptoui_f32_i32_simd: // @test_fptoui_f32_i32_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu s0, s0
+ ret
+.Lfunc_end11:
+ .size test_fptoui_f32_i32_simd, .Lfunc_end11-test_fptoui_f32_i32_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptosi_i32_f16_simd // -- Begin function fptosi_i32_f16_simd
+ .p2align 2
+ .type fptosi_i32_f16_simd,@function
+fptosi_i32_f16_simd: // @fptosi_i32_f16_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs s0, h0
+ ret
+.Lfunc_end12:
+ .size fptosi_i32_f16_simd, .Lfunc_end12-fptosi_i32_f16_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptosi_i64_f16_simd // -- Begin function fptosi_i64_f16_simd
+ .p2align 2
+ .type fptosi_i64_f16_simd,@function
+fptosi_i64_f16_simd: // @fptosi_i64_f16_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs d0, h0
+ ret
+.Lfunc_end13:
+ .size fptosi_i64_f16_simd, .Lfunc_end13-fptosi_i64_f16_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptosi_i64_f32_simd // -- Begin function fptosi_i64_f32_simd
+ .p2align 2
+ .type fptosi_i64_f32_simd,@function
+fptosi_i64_f32_simd: // @fptosi_i64_f32_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs d0, s0
+ ret
+.Lfunc_end14:
+ .size fptosi_i64_f32_simd, .Lfunc_end14-fptosi_i64_f32_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptosi_i32_f64_simd // -- Begin function fptosi_i32_f64_simd
+ .p2align 2
+ .type fptosi_i32_f64_simd,@function
+fptosi_i32_f64_simd: // @fptosi_i32_f64_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs s0, d0
+ ret
+.Lfunc_end15:
+ .size fptosi_i32_f64_simd, .Lfunc_end15-fptosi_i32_f64_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptosi_i64_f64_simd // -- Begin function fptosi_i64_f64_simd
+ .p2align 2
+ .type fptosi_i64_f64_simd,@function
+fptosi_i64_f64_simd: // @fptosi_i64_f64_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs d0, d0
+ ret
+.Lfunc_end16:
+ .size fptosi_i64_f64_simd, .Lfunc_end16-fptosi_i64_f64_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptosi_i32_f32_simd // -- Begin function fptosi_i32_f32_simd
+ .p2align 2
+ .type fptosi_i32_f32_simd,@function
+fptosi_i32_f32_simd: // @fptosi_i32_f32_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzs s0, s0
+ ret
+.Lfunc_end17:
+ .size fptosi_i32_f32_simd, .Lfunc_end17-fptosi_i32_f32_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptoui_i32_f16_simd // -- Begin function fptoui_i32_f16_simd
+ .p2align 2
+ .type fptoui_i32_f16_simd,@function
+fptoui_i32_f16_simd: // @fptoui_i32_f16_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu s0, h0
+ ret
+.Lfunc_end18:
+ .size fptoui_i32_f16_simd, .Lfunc_end18-fptoui_i32_f16_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptoui_i64_f16_simd // -- Begin function fptoui_i64_f16_simd
+ .p2align 2
+ .type fptoui_i64_f16_simd,@function
+fptoui_i64_f16_simd: // @fptoui_i64_f16_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu d0, h0
+ ret
+.Lfunc_end19:
+ .size fptoui_i64_f16_simd, .Lfunc_end19-fptoui_i64_f16_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptoui_i64_f32_simd // -- Begin function fptoui_i64_f32_simd
+ .p2align 2
+ .type fptoui_i64_f32_simd,@function
+fptoui_i64_f32_simd: // @fptoui_i64_f32_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu d0, s0
+ ret
+.Lfunc_end20:
+ .size fptoui_i64_f32_simd, .Lfunc_end20-fptoui_i64_f32_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptoui_i32_f64_simd // -- Begin function fptoui_i32_f64_simd
+ .p2align 2
+ .type fptoui_i32_f64_simd,@function
+fptoui_i32_f64_simd: // @fptoui_i32_f64_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu s0, d0
+ ret
+.Lfunc_end21:
+ .size fptoui_i32_f64_simd, .Lfunc_end21-fptoui_i32_f64_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptoui_i64_f64_simd // -- Begin function fptoui_i64_f64_simd
+ .p2align 2
+ .type fptoui_i64_f64_simd,@function
+fptoui_i64_f64_simd: // @fptoui_i64_f64_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu d0, d0
+ ret
+.Lfunc_end22:
+ .size fptoui_i64_f64_simd, .Lfunc_end22-fptoui_i64_f64_simd
+ .cfi_endproc
+ // -- End function
+ .globl fptoui_i32_f32_simd // -- Begin function fptoui_i32_f32_simd
+ .p2align 2
+ .type fptoui_i32_f32_simd,@function
+fptoui_i32_f32_simd: // @fptoui_i32_f32_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtzu s0, s0
+ ret
+.Lfunc_end23:
+ .size fptoui_i32_f32_simd, .Lfunc_end23-fptoui_i32_f32_simd
+ .cfi_endproc
+ // -- End function
+ .globl fcvtas_ds_round_simd // -- Begin function fcvtas_ds_round_simd
+ .p2align 2
+ .type fcvtas_ds_round_simd,@function
+fcvtas_ds_round_simd: // @fcvtas_ds_round_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtas d0, s0
+ ret
+.Lfunc_end24:
+ .size fcvtas_ds_round_simd, .Lfunc_end24-fcvtas_ds_round_simd
+ .cfi_endproc
+ // -- End function
+ .globl fcvtas_sd_round_simd // -- Begin function fcvtas_sd_round_simd
+ .p2align 2
+ .type fcvtas_sd_round_simd,@function
+fcvtas_sd_round_simd: // @fcvtas_sd_round_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtas s0, d0
+ ret
+.Lfunc_end25:
+ .size fcvtas_sd_round_simd, .Lfunc_end25-fcvtas_sd_round_simd
+ .cfi_endproc
+ // -- End function
+ .globl fcvtas_ss_round_simd // -- Begin function fcvtas_ss_round_simd
+ .p2align 2
+ .type fcvtas_ss_round_simd,@function
+fcvtas_ss_round_simd: // @fcvtas_ss_round_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtas s0, s0
+ ret
+.Lfunc_end26:
+ .size fcvtas_ss_round_simd, .Lfunc_end26-fcvtas_ss_round_simd
+ .cfi_endproc
+ // -- End function
+ .globl fcvtas_dd_round_simd // -- Begin function fcvtas_dd_round_simd
+ .p2align 2
+ .type fcvtas_dd_round_simd,@function
+fcvtas_dd_round_simd: // @fcvtas_dd_round_simd
+ .cfi_startproc
+// %bb.0:
+ fcvtas d0, d0
+ ret
+.Lfunc_end27:
+ .size fcvtas_dd_round_sim...
[truncated]
Pull request overview
This pull request optimizes floating-point conversion operations for SIMD instructions on AArch64. The optimization eliminates unnecessary register moves between GPR and FPR registers when converting floating-point values to integers and then moving them to vector registers. The patch adds patterns for lowering FCVT intrinsics followed by scalar_to_vector nodes directly into SIMD FCVT instructions.
- Adds patterns to emit direct SIMD FCVT instructions instead of GPR conversions followed by register moves (a representative pattern is sketched below)
- Updates test expectations to reflect the optimized instruction sequences (removes extra fmov instructions)
- Covers both standard and strict floating-point conversion intrinsics
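For reference, the patterns being added have the following shape (this one is taken verbatim from the AArch64InstrInfo.td hunk above): a conversion whose i64 result feeds a scalar_to_vector is matched straight to the D-register FCVT instruction.

def : Pat<(v1i64 (scalar_to_vector (i64 (OpN (f32 FPR32:$Rn))))),
          (!cast<Instruction>(INST # DSr) FPR32:$Rn)>;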
Reviewed changes
Copilot reviewed 9 out of 9 changed files in this pull request and generated 18 comments.
| File | Description |
|---|---|
| llvm/lib/Target/AArch64/AArch64InstrInfo.td | Adds instruction selection patterns for scalar_to_vector following FCVT operations |
| llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll | Updates test expectations to verify SIMD FCVT instructions are used |
| llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll | Updates test expectations for constrained FP conversions |
| llvm/test/CodeGen/AArch64/arm64-vcvt.ll | Removes FIXME comments and updates tests for improved code generation |
| llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | Updates test expectations for bitcast and FCVT sequences |
| llvm/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll | Updates test expectations for fixed-point conversion optimizations |
| llvm/test/CodeGen/AArch64/arm64-cvt-simd-intrinsics.ll | Adds comprehensive test coverage for new scalar_to_vector patterns with various FCVT intrinsics |
| llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.s | Adds assembly test file for generated SIMD conversion instructions |
| llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll | Adds LLVM IR test coverage for scalar_to_vector patterns with fptosi/fptoui and strict variants |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed.
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed.
; Intriniscs (scalar_to_vector)

Suggested change:

; Intrinsics (scalar_to_vector)
  ret <2 x i64> %tmp3
}

; FIXME: Generate "fcvtzs d0, d0"?

Can these FIXMEs be removed now?
nit: extra lines between tests
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtms d0, h0
; CHECK-NEXT: ret
  %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtms.i64.f16(half %a)

Suggested change:

  %vcvtmh_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtms.i64.f16(half %a)
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtas d0, s0
; CHECK-NEXT: ret
  %i = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A)

Is it worth updating the tests with %i to something like %vcvtah_s64_f32 to match the others?