[AArch64][GlobalISel] Add codegen for simd fpcvt intrinsics #157680
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: None (Lukacma)

Changes

This patch is the first in a series of patches that add codegen support for fcvt instructions that keep the result in 32-bit or 64-bit SIMD&FP registers. For a long time, LLVM primarily generated the fcvt variants that store the result in GPRs, resulting in extra moves when the value was used by NEON instructions that operate on SIMD&FP registers. Although patterns existed for generating the SIMD variants, they relied on single-element vector types (such as v1i32 or v1i64) to decide whether the SIMD variant should be selected. This was not useful, because many NEON intrinsics and other LLVM IR operations use scalar types (i32/i64) even though they expect the result to be stored in SIMD&FP registers.

This patch is part of a series that addresses this and also adds support for generating these instructions in GlobalISel. To fix this in SelectionDAG, bitcasts of the result to a floating-point type serve as a hint that the SIMD variant of the conversion should be used, rather than relying on single-element vector types. These bitcasts are not currently generated by LLVM, but the goal is to add explicit bitcasts to the inputs and outputs of NEON intrinsics operating on integers in follow-up patches.

For GlobalISel, the register bank selection algorithm is used to determine which variant to generate.
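A minimal illustration of the SelectionDAG hint, condensed from the tests added in this patch (the function name is illustrative): the bitcast of the intrinsic's i32 result to float signals that the value should stay in a SIMD&FP register.

define float @fcvtas_to_fpr(double %a) {
  ; With this patch this can select "fcvtas s0, d0" directly; without the
  ; bitcast hint the result lands in a GPR and needs an extra move
  ; (roughly "fcvtas w8, d0" followed by "fmov s0, w8").
  %i = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %a)
  %f = bitcast i32 %i to float
  ret float %f
}

Patch is 30.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157680.diff

4 Files Affected: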
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 8958ad129269c..74e8b98d7a47a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5299,28 +5299,29 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
}
}
-multiclass FPToIntegerSIMDScalar<bits<2> rmode, bits<3> opcode, string asm> {
+multiclass FPToIntegerSIMDScalar<bits<2> rmode, bits<3> opcode, string asm,
+ SDPatternOperator OpN = null_frag> {
// double-precision to 32-bit SIMD/FPR
def SDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, FPR32, asm,
- []> {
+ [(set FPR32:$Rd, (i32 (OpN (f64 FPR64:$Rn))))]> {
let Inst{31} = 0; // 32-bit FPR flag
}
// half-precision to 32-bit SIMD/FPR
def SHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, FPR32, asm,
- []> {
+ [(set FPR32:$Rd, (i32 (OpN (f16 FPR16:$Rn))))]> {
let Inst{31} = 0; // 32-bit FPR flag
}
// half-precision to 64-bit SIMD/FPR
def DHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, FPR64, asm,
- []> {
+ [(set FPR64:$Rd, (i64 (OpN (f16 FPR16:$Rn))))]> {
let Inst{31} = 1; // 64-bit FPR flag
}
// single-precision to 64-bit SIMD/FPR
def DSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, FPR64, asm,
- []> {
+ [(set FPR64:$Rd, (i64 (OpN (f32 FPR32:$Rn))))]> {
let Inst{31} = 1; // 64-bit FPR flag
}
}
@@ -7949,6 +7950,21 @@ multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
}
}
+let mayRaiseFPException = 1, Uses = [FPCR] in
+multiclass SIMDFPTwoScalarFCVT<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpN = null_frag> {
+ let Predicates = [HasNEONandIsStreamingSafe], FastISelShouldIgnore = 1 in {
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
+ [(set (i64 FPR64:$Rd), (OpN (f64 FPR64:$Rn)))]>;
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
+ [(set FPR32:$Rd, (i32 (OpN (f32 FPR32:$Rn))))]>;
+ }
+ let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
+ def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
+ [(set FPR16:$Rd, (i16 (OpN (f16 FPR16:$Rn))))]>;
+ }
+}
+
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 62b26b5239365..b23f7a58ee4c0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5212,19 +5212,54 @@ defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
+defm FCVTAS : SIMDFPTwoScalarFCVT< 0, 0, 0b11100, "fcvtas", int_aarch64_neon_fcvtas>;
+defm FCVTAU : SIMDFPTwoScalarFCVT< 1, 0, 0b11100, "fcvtau", int_aarch64_neon_fcvtau>;
+defm FCVTMS : SIMDFPTwoScalarFCVT< 0, 0, 0b11011, "fcvtms", int_aarch64_neon_fcvtms>;
+defm FCVTMU : SIMDFPTwoScalarFCVT< 1, 0, 0b11011, "fcvtmu", int_aarch64_neon_fcvtmu>;
+defm FCVTNS : SIMDFPTwoScalarFCVT< 0, 0, 0b11010, "fcvtns", int_aarch64_neon_fcvtns>;
+defm FCVTNU : SIMDFPTwoScalarFCVT< 1, 0, 0b11010, "fcvtnu", int_aarch64_neon_fcvtnu>;
+defm FCVTPS : SIMDFPTwoScalarFCVT< 0, 1, 0b11010, "fcvtps", int_aarch64_neon_fcvtps>;
+defm FCVTPU : SIMDFPTwoScalarFCVT< 1, 1, 0b11010, "fcvtpu", int_aarch64_neon_fcvtpu>;
+defm FCVTZS : SIMDFPTwoScalarFCVT< 0, 1, 0b11011, "fcvtzs">;
+defm FCVTZU : SIMDFPTwoScalarFCVT< 1, 1, 0b11011, "fcvtzu">;
+
let Predicates = [HasNEON, HasFPRCVT] in{
- defm FCVTAS : FPToIntegerSIMDScalar<0b11, 0b010, "fcvtas">;
- defm FCVTAU : FPToIntegerSIMDScalar<0b11, 0b011, "fcvtau">;
- defm FCVTMS : FPToIntegerSIMDScalar<0b10, 0b100, "fcvtms">;
- defm FCVTMU : FPToIntegerSIMDScalar<0b10, 0b101, "fcvtmu">;
- defm FCVTNS : FPToIntegerSIMDScalar<0b01, 0b010, "fcvtns">;
- defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu">;
- defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps">;
- defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu">;
+ defm FCVTAS : FPToIntegerSIMDScalar<0b11, 0b010, "fcvtas", int_aarch64_neon_fcvtas>;
+ defm FCVTAU : FPToIntegerSIMDScalar<0b11, 0b011, "fcvtau", int_aarch64_neon_fcvtau>;
+ defm FCVTMS : FPToIntegerSIMDScalar<0b10, 0b100, "fcvtms", int_aarch64_neon_fcvtms>;
+ defm FCVTMU : FPToIntegerSIMDScalar<0b10, 0b101, "fcvtmu", int_aarch64_neon_fcvtmu>;
+ defm FCVTNS : FPToIntegerSIMDScalar<0b01, 0b010, "fcvtns", int_aarch64_neon_fcvtns>;
+ defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu", int_aarch64_neon_fcvtnu>;
+ defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps", int_aarch64_neon_fcvtps>;
+ defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu", int_aarch64_neon_fcvtpu>;
defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs">;
defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu">;
}
+multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> {
+ def : Pat<(f32 (bitconvert (i32 (OpN (f64 FPR64:$Rn))))),
+ (!cast<Instruction>(INST # SDr) FPR64:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (OpN (f16 FPR16:$Rn))))),
+ (!cast<Instruction>(INST # SHr) FPR16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (OpN (f16 FPR16:$Rn))))),
+ (!cast<Instruction>(INST # DHr) FPR16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (OpN (f32 FPR32:$Rn))))),
+ (!cast<Instruction>(INST # DSr) FPR32:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (OpN (f32 FPR32:$Rn))))),
+ (!cast<Instruction>(INST # v1i32) FPR32:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (OpN (f64 FPR64:$Rn))))),
+ (!cast<Instruction>(INST # v1i64) FPR64:$Rn)>;
+
+}
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtas, "FCVTAS">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtau, "FCVTAU">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtms, "FCVTMS">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtmu, "FCVTMU">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtns, "FCVTNS">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtnu, "FCVTNU">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtps, "FCVTPS">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtpu, "FCVTPU">;
+
// AArch64's FCVT instructions saturate when out of range.
multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> {
let Predicates = [HasFullFP16] in {
@@ -5301,6 +5336,32 @@ multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>;
def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>;
+ // For global-isel we can use register classes to determine
+ // which FCVT instruction to use.
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # SHr) $Rn)>;
+ def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # DHr) $Rn)>;
+ def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # DSr) $Rn)>;
+ def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # SDr) $Rn)>;
+ }
+ def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # v1i32) $Rn)>;
+ def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # v1i64) $Rn)>;
+
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(f32 (bitconvert (i32 (round f16:$Rn)))),
+ (!cast<Instruction>(INST # SHr) $Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (round f16:$Rn)))),
+ (!cast<Instruction>(INST # DHr) $Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (round f32:$Rn)))),
+ (!cast<Instruction>(INST # DSr) $Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (round f64:$Rn)))),
+ (!cast<Instruction>(INST # SDr) $Rn)>;
+ }
+ def : Pat<(f32 (bitconvert (i32 (round f32:$Rn)))),
+ (!cast<Instruction>(INST # v1i32) $Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (round f64:$Rn)))),
+ (!cast<Instruction>(INST # v1i64) $Rn)>;
+
let Predicates = [HasFullFP16] in {
def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
(!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
@@ -6549,17 +6610,7 @@ defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
-defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">;
-defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">;
-defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">;
-defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">;
-defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">;
-defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">;
-defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">;
-defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
-defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
-defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index cf391c446a955..42ea80a679cb7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -568,9 +568,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
case Intrinsic::aarch64_neon_fcvtnu:
case Intrinsic::aarch64_neon_fcvtps:
case Intrinsic::aarch64_neon_fcvtpu:
- // Force FPR register bank for half types, as those types otherwise
- // don't get legalized correctly resulting in fp16 <-> gpr32 COPY's.
- return MRI.getType(MI.getOperand(2).getReg()) == LLT::float16();
+ return true;
default:
break;
}
@@ -1143,6 +1141,34 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_INTRINSIC:
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: {
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
+ case Intrinsic::aarch64_neon_fcvtas:
+ case Intrinsic::aarch64_neon_fcvtau:
+ case Intrinsic::aarch64_neon_fcvtzs:
+ case Intrinsic::aarch64_neon_fcvtzu:
+ case Intrinsic::aarch64_neon_fcvtms:
+ case Intrinsic::aarch64_neon_fcvtmu:
+ case Intrinsic::aarch64_neon_fcvtns:
+ case Intrinsic::aarch64_neon_fcvtnu:
+ case Intrinsic::aarch64_neon_fcvtps:
+ case Intrinsic::aarch64_neon_fcvtpu: {
+ OpRegBankIdx[2] = PMI_FirstFPR;
+ if (MRI.getType(MI.getOperand(0).getReg()).isVector()) {
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
+ TypeSize DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ TypeSize SrcSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, TRI);
+ if (((DstSize == SrcSize) || STI.hasFeature(AArch64::FeatureFPRCVT)) &&
+ all_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
+ [&](const MachineInstr &UseMI) {
+ return onlyUsesFP(UseMI, MRI, TRI) ||
+ prefersFPUse(UseMI, MRI, TRI);
+ }))
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ else
+ OpRegBankIdx[0] = PMI_FirstGPR;
+ break;
+ }
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
case Intrinsic::aarch64_neon_vcvtfp2fxs:
diff --git a/llvm/test/CodeGen/AArch64/arm64-cvt-simd-intrinsics.ll b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-intrinsics.ll
new file mode 100644
index 0000000000000..ae4f83a5bd261
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-intrinsics.ll
@@ -0,0 +1,612 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fprcvt,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -global-isel -global-isel-abort=2 -mattr=+fprcvt,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+
+;
+; Intrinsics
+;
+
+define float @fcvtas_1s1d_simd(double %A) nounwind {
+; CHECK-LABEL: fcvtas_1s1d_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %A)
+ %f = bitcast i32 %i to float
+ ret float %f
+}
+
+define double @fcvtas_1d1s_simd(float %A) nounwind {
+; CHECK-LABEL: fcvtas_1d1s_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, s0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A)
+ %d = bitcast i64 %i to double
+ ret double %d
+}
+
+define dso_local float @fcvtas_1s1h_simd(half %a) {
+; CHECK-LABEL: fcvtas_1s1h_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, h0
+; CHECK-NEXT: ret
+ %fcvt = tail call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a)
+ %f = bitcast i32 %fcvt to float
+ ret float %f
+}
+
+define dso_local double @fcvtas_1d1h_simd(half %a) {
+; CHECK-LABEL: fcvtas_1d1h_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, h0
+; CHECK-NEXT: ret
+ %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtas.i64.f16(half %a)
+ %d = bitcast i64 %vcvtah_s64_f16 to double
+ ret double %d
+}
+
+define dso_local double @fcvtas_1d1d_simd(double %a) {
+; CHECK-LABEL: fcvtas_1d1d_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, d0
+; CHECK-NEXT: ret
+ %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %a)
+ %d = bitcast i64 %vcvtah_s64_f64 to double
+ ret double %d
+}
+
+define dso_local float @fcvtas_1s1s_simd(float %a) {
+; CHECK-LABEL: fcvtas_1s1s_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, s0
+; CHECK-NEXT: ret
+ %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %a)
+ %d = bitcast i32 %vcvtah_s32_f32 to float
+ ret float %d
+}
+
+
+define float @fcvtau_1s1d_simd(double %A) nounwind {
+; CHECK-LABEL: fcvtau_1s1d_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double %A)
+ %f = bitcast i32 %i to float
+ ret float %f
+}
+
+define double @fcvtau_1d1s_simd(float %A) nounwind {
+; CHECK-LABEL: fcvtau_1d1s_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, s0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float %A)
+ %d = bitcast i64 %i to double
+ ret double %d
+}
+
+define dso_local float @fcvtau_1s1h_simd(half %a) {
+; CHECK-LABEL: fcvtau_1s1h_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, h0
+; CHECK-NEXT: ret
+ %fcvt = tail call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a)
+ %f = bitcast i32 %fcvt to float
+ ret float %f
+}
+
+define dso_local double @fcvtau_1d1h_simd(half %a) {
+; CHECK-LABEL: fcvtau_1d1h_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, h0
+; CHECK-NEXT: ret
+ %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtau.i64.f16(half %a)
+ %d = bitcast i64 %vcvtah_s64_f16 to double
+ ret double %d
+}
+
+define dso_local double @fcvtau_1d1d_simd(double %a) {
+; CHECK-LABEL: fcvtau_1d1d_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, d0
+; CHECK-NEXT: ret
+ %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %a)
+ %d = bitcast i64 %vcvtah_s64_f64 to double
+ ret double %d
+}
+
+define dso_local float @fcvtau_1s1s_simd(float %a) {
+; CHECK-LABEL: fcvtau_1s1s_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, s0
+; CHECK-NEXT: ret
+ %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %a)
+ %d = bitcast i32 %vcvtah_s32_f32 to float
+ ret float %d
+}
+
+define float @fcvtms_1s1d_simd(double %A) nounwind {
+; CHECK-LABEL: fcvtms_1s1d_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double %A)
+ %f = bitcast i32 %i to float
+ ret float %f
+}
+
+define double @fcvtms_1d1s_simd(float %A) nounwind {
+; CHECK-LABEL: fcvtms_1d1s_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, s0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float %A)
+ %d = bitcast i64 %i to double
+ ret double %d
+}
+
+define dso_local float @fcvtms_1s1h_simd(half %a) {
+; CHECK-LABEL: fcvtms_1s1h_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, h0
+; CHECK-NEXT: ret
+ %fcvt = tail call i32 @llvm.aarch64.neon.fcvtms.i32.f16(half %a)
+ %f = bitcast i32 %fcvt to float
+ ret float %f
+}
+
+define dso_local double @fcvtms_1d1h_simd(half %a) {
+; CHECK-LABEL: fcvtms_1d1h_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, h0
+; CHECK-NEXT: ret
+ %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtms.i64.f16(half %a)
+ %d = bitcast i64 %vcvtah_s64_f16 to double
+ ret double %d
+}
+
+define dso_local double @fcvtms_1d1d_simd(double %a) {
+; CHECK-LABEL: fcvtms_1d1d_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, d0
+; CHECK-NEXT: ret
+ %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %a)
+ %d = bitcast i64 %vcvtah_s64_f64 to double
+ ret double %d
+}
+
+define dso_local float @fcvtms_1s1s_simd(float %a) {
+; CHECK-LABEL: fcvtms_1s1s_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, s0
+; CHECK-NEXT: ret
+ %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %a)
+ %d = bitcast i32 %vcvtah_s32_f32 to float
+ ret float %d
+}
+
+define float @fcvtmu_1s1d_simd(double %A) nounwind {
+; CHECK-LABEL: fcvtmu_1s1d_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double %A)
+ %f = bitcast i32 %i to float
+ ret float %f
+}
+
+define double @fcvtmu_1d1s_simd(float %A) nounwind {
+; CHECK-LABEL: fcvtmu_1d1s_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, s0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float %A)
+ %d = bitcast i64 %i to double
+ ret double %d
+}
+
+define dso_local float @fcvtmu_1s1h_simd(half %a) {
+; CHECK-LABEL: fcvtmu_1s1h_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, h0
+; CHECK-NEXT: ret
+ %fcvt = tail call i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half %a)
+ %f = bitcast i32 %fcvt to float
+ ret float %f
+}
+
+define dso_local double @fcvtmu_1d1h_simd(half %a) {
+; CHECK-LABEL: fcvtmu_1d1h_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, h0
+; CHECK-NEXT: ret
+ %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtmu.i64.f16(half %a)
+ %d = bitcast i64 %vcvtah_s64_f16 to double
+ ret double %d
+}
+
+define dso_local double @fcvtmu_1d1d_simd(double %a) {
+; CHECK-LABEL: fcvtmu_1d1d_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, d0
+; CHECK-NEXT: ret
+ %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %a)
+ %d = bitcast i64 %vcvtah_s64_f64 to double
+ ret double %d
+}
+
+define dso_local float @fcvtmu_1s1s_simd(float %a) {
+; CHECK-LABEL: fcvtmu_1s1s_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, s0
+; CHECK-NEXT: ret
+ %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %a)
+ %d = bitcast i32 %vcvtah_s32_f32 to float
+ ret float %d
+}
+
+define float @fcvtns_1s1d_simd(double %A) nounwind {
+; CHECK-LABEL: fcvtns_1s1d_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtns s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double %A)
+ %f = bitcast i32 %i to float
+ ret float %f
+}...
[truncated]
Thanks - at a high level, I was wondering if we could remove aarch64_neon_fcvtzs entirely and just use fptosi_sat, but for strict-fp we likely still need it.
defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
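For reference, a sketch of the alternative the comment suggests (not part of the patch; function names are illustrative): llvm.fptosi.sat has the same saturating semantics as fcvtzs, which is why both forms below can end up as the same instruction for non-strict code, while strict FP has no constrained fptosi.sat and so likely still needs the intrinsic.

define i32 @via_sat(float %a) {
  ; expected to lower to fcvtzs, since AArch64's FCVT instructions
  ; saturate on out-of-range inputs (as the patterns in this diff note)
  %i = call i32 @llvm.fptosi.sat.i32.f32(float %a)
  ret i32 %i
}

define i32 @via_intrinsic(float %a) {
  ; the target intrinsic names the instruction directly
  %i = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
  ret i32 %i
}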
defm FCVTAS : SIMDFPTwoScalarFCVT< 0, 0, 0b11100, "fcvtas", int_aarch64_neon_fcvtas>;
In relation to this from #156892:

"I wanted to keep patterns together and unfortunately in TableGen you need to define records above their usage, so I needed to move these instructions up."

Could we keep the instructions together with the correct kinds, and move the patterns later? I think it's OK to move the patterns later if you wanted to keep them together.
I can do that, but that will require moving all patterns down, which seems more intrusive than moving a couple of instructions up. Why is it so important to keep the instructions in their original location? The description of the section they are moved to matches their behaviour as well.
It's just that I was considering the v1i32 instructions "scalar NEON instructions" that really operate on the first lane of a NEON register and require hasNeon, I believe. The SDr-style instructions are "FP" instructions that tend to operate between FPR and GPR. I would still consider the v1i32 instructions "Advanced SIMD two scalar instructions", a little different from normal FP instructions.
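To make that distinction concrete, a hypothetical IR sketch (not from the patch; function names are illustrative): the same intrinsic can end up as either form depending on where the result is needed.

define i32 @result_in_gpr(float %a) {
  ; integer result returned in w0: the "FP" form, fcvtzs w0, s0 (FPR -> GPR),
  ; is the natural choice
  %i = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
  ret i32 %i
}

define float @result_in_fpr(float %a) {
  ; the bitcast keeps the result in s0: the "Advanced SIMD two scalar" form,
  ; fcvtzs s0, s0 (first lane of a NEON register), fits instead
  %i = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
  %f = bitcast i32 %i to float
  ret float %f
}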
Ok moved them back and moved patterns down.
}

let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPTwoScalarFCVT<bit U, bit S, bits<5> opc, string asm, |
I think you can fold this into SIMDFPTwoScalar, provided they pass null_frag.
That will disable all instructions which use that class in FastISel. But if that is fine I can do that.
LGTM, thanks