Skip to content

Commit

Permalink
[ARM] VCVTT fpround instruction selection
Browse files Browse the repository at this point in the history
Similar to the recent patch for fpext, this adds instruction selection
patterns that use vcvtb (even lanes) and vcvtt (odd lanes) to perform an
fptrunc and insert the result directly into a vector lane. This removes a
lot of the register shuffling that we would otherwise generate.

Differential Revision: https://reviews.llvm.org/D81637
  • Loading branch information
davemgreen committed Jun 26, 2020
1 parent 712b0a2 commit d428f88
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 402 deletions.
15 changes: 14 additions & 1 deletion llvm/lib/Target/ARM/ARMInstrVFP.td
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,12 @@ def : FP16Pat<(f16 (fpround SPR:$Sm)),
(COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>;
// Select fp_to_f16 of an SPR operand as VCVTBSH; the converted value is then
// copied into the GPR register class and typed as i32.
def : FP16Pat<(fp_to_f16 SPR:$a),
(i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
// Fold an f32 -> f16 fpround whose result is inserted into an even lane of a
// vector into a single VCVTBSH. The converted value is written into the
// vector with INSERT_SUBREG, using the subregister index that
// SSubReg_f16_reg produces from $lane.
// NOTE(review): INSERT_SUBREG replaces the whole selected subregister and the
// existing vector contents are not passed as an input to VCVTBSH; presumably
// the neighbouring f16 lane sharing that S register is expected to be
// preserved by the instruction itself -- confirm this is modelled correctly.
//
// 128-bit form: v8f16 vector held in an MQPR register.
def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane),
(v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTBSH SPR:$src2),
(SSubReg_f16_reg imm:$lane)))>;
// 64-bit form: v4f16 vector held in a DPR register.
def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane),
(v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTBSH SPR:$src2),
(SSubReg_f16_reg imm:$lane)))>;

def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
Expand All @@ -772,10 +778,17 @@ def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))),

def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
[/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;

// Fold an f32 -> f16 fpround whose result is inserted into an odd lane of a
// vector into a single VCVTTSH. The converted value is written into the
// vector with INSERT_SUBREG, using the subregister index that
// SSubReg_f16_reg produces from $lane.
// NOTE(review): INSERT_SUBREG replaces the whole selected subregister and the
// existing vector contents are not passed as an input to VCVTTSH; presumably
// the neighbouring f16 lane sharing that S register is expected to be
// preserved by the instruction itself -- confirm this is modelled correctly.
//
// 128-bit form: v8f16 vector held in an MQPR register.
def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
(v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH SPR:$src2),
(SSubReg_f16_reg imm:$lane)))>;
// 64-bit form: v4f16 vector held in a DPR register.
def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
(v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTTSH SPR:$src2),
(SSubReg_f16_reg imm:$lane)))>;

def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
Expand Down
50 changes: 19 additions & 31 deletions llvm/test/CodeGen/ARM/fp16-insert-extract.ll
Original file line number Diff line number Diff line change
Expand Up @@ -77,19 +77,15 @@ entry:
define <4 x half> @test_vset_lane_f16(<4 x half> %a, float %fb) nounwind {
; CHECKHARD-LABEL: test_vset_lane_f16:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vcvtb.f16.f32 s2, s2
; CHECKHARD-NEXT: vmov r0, s2
; CHECKHARD-NEXT: vmov.16 d0[3], r0
; CHECKHARD-NEXT: vcvtt.f16.f32 s1, s2
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: test_vset_lane_f16:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov s0, r2
; CHECKSOFT-NEXT: vcvtb.f16.f32 s0, s0
; CHECKSOFT-NEXT: vmov d16, r0, r1
; CHECKSOFT-NEXT: vmov r2, s0
; CHECKSOFT-NEXT: vmov.16 d16[3], r2
; CHECKSOFT-NEXT: vmov r0, r1, d16
; CHECKSOFT-NEXT: vmov d0, r0, r1
; CHECKSOFT-NEXT: vmov s2, r2
; CHECKSOFT-NEXT: vcvtt.f16.f32 s1, s2
; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: bx lr
entry:
%b = fptrunc float %fb to half
Expand All @@ -100,21 +96,17 @@ entry:
define <8 x half> @test_vset_laneq_f16_1(<8 x half> %a, float %fb) nounwind {
; CHECKHARD-LABEL: test_vset_laneq_f16_1:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vcvtb.f16.f32 s4, s4
; CHECKHARD-NEXT: vmov r0, s4
; CHECKHARD-NEXT: vmov.16 d0[1], r0
; CHECKHARD-NEXT: vcvtt.f16.f32 s0, s4
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: test_vset_laneq_f16_1:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vldr s0, [sp]
; CHECKSOFT-NEXT: vmov d17, r2, r3
; CHECKSOFT-NEXT: vmov d16, r0, r1
; CHECKSOFT-NEXT: vcvtb.f16.f32 s0, s0
; CHECKSOFT-NEXT: vmov r12, s0
; CHECKSOFT-NEXT: vmov.16 d16[1], r12
; CHECKSOFT-NEXT: vmov r2, r3, d17
; CHECKSOFT-NEXT: vmov r0, r1, d16
; CHECKSOFT-NEXT: vmov d1, r2, r3
; CHECKSOFT-NEXT: vldr s4, [sp]
; CHECKSOFT-NEXT: vmov d0, r0, r1
; CHECKSOFT-NEXT: vcvtt.f16.f32 s0, s4
; CHECKSOFT-NEXT: vmov r2, r3, d1
; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: bx lr
entry:
%b = fptrunc float %fb to half
Expand All @@ -125,21 +117,17 @@ entry:
define <8 x half> @test_vset_laneq_f16_7(<8 x half> %a, float %fb) nounwind {
; CHECKHARD-LABEL: test_vset_laneq_f16_7:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vcvtb.f16.f32 s4, s4
; CHECKHARD-NEXT: vmov r0, s4
; CHECKHARD-NEXT: vmov.16 d1[3], r0
; CHECKHARD-NEXT: vcvtt.f16.f32 s3, s4
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: test_vset_laneq_f16_7:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vldr s0, [sp]
; CHECKSOFT-NEXT: vmov d17, r2, r3
; CHECKSOFT-NEXT: vmov d16, r0, r1
; CHECKSOFT-NEXT: vcvtb.f16.f32 s0, s0
; CHECKSOFT-NEXT: vmov r12, s0
; CHECKSOFT-NEXT: vmov.16 d17[3], r12
; CHECKSOFT-NEXT: vmov r0, r1, d16
; CHECKSOFT-NEXT: vmov r2, r3, d17
; CHECKSOFT-NEXT: vmov d1, r2, r3
; CHECKSOFT-NEXT: vldr s4, [sp]
; CHECKSOFT-NEXT: vmov d0, r0, r1
; CHECKSOFT-NEXT: vcvtt.f16.f32 s3, s4
; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: vmov r2, r3, d1
; CHECKSOFT-NEXT: bx lr
entry:
%b = fptrunc float %fb to half
Expand Down
43 changes: 14 additions & 29 deletions llvm/test/CodeGen/Thumb2/mve-div-expand.ll
Original file line number Diff line number Diff line change
Expand Up @@ -968,8 +968,8 @@ entry:
define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) {
; CHECK-LABEL: frem_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vmov q5, q0
Expand All @@ -979,76 +979,61 @@ define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) {
; CHECK-NEXT: vcvtb.f32.f16 s0, s16
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vcvtt.f32.f16 s0, s20
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov s24, r0
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vcvtt.f32.f16 s0, s16
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov.16 q6[0], r4
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtb.f16.f32 s24, s24
; CHECK-NEXT: vcvtt.f16.f32 s24, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s21
; CHECK-NEXT: vmov.16 q6[1], r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s17
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtb.f16.f32 s25, s0
; CHECK-NEXT: vcvtt.f32.f16 s0, s21
; CHECK-NEXT: vmov.16 q6[2], r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtt.f32.f16 s0, s17
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtt.f16.f32 s25, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s22
; CHECK-NEXT: vmov.16 q6[3], r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s18
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtb.f16.f32 s26, s0
; CHECK-NEXT: vcvtt.f32.f16 s0, s22
; CHECK-NEXT: vmov.16 q6[4], r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtt.f32.f16 s0, s18
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtt.f16.f32 s26, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s23
; CHECK-NEXT: vmov.16 q6[5], r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s19
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtb.f16.f32 s27, s0
; CHECK-NEXT: vcvtt.f32.f16 s0, s23
; CHECK-NEXT: vmov.16 q6[6], r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vcvtt.f32.f16 s0, s19
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q6[7], r0
; CHECK-NEXT: vcvtt.f16.f32 s27, s0
; CHECK-NEXT: vmov q0, q6
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: pop {r7, pc}
entry:
%out = frem <8 x half> %in1, %in2
ret <8 x half> %out
Expand Down
Loading

0 comments on commit d428f88

Please sign in to comment.