Skip to content

Commit

Permalink
[X86] Change precision control to FP80 during u64->fp32 conversion on…
Browse files Browse the repository at this point in the history
… Windows.

This is an alternative to D141074 to fix the problem by adjusting
the precision control dynamically.

Reviewed By: icedrocket

Differential Revision: https://reviews.llvm.org/D142178
  • Loading branch information
topperc committed Feb 6, 2023
1 parent 402981e commit 11fb09e
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 5 deletions.
81 changes: 78 additions & 3 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -22048,15 +22048,25 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
if (IsStrict) {
SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
{Chain, Fild, Fudge});
unsigned Opc = ISD::STRICT_FADD;
// Windows needs the precision control changed to 80bits around this add.
if (Subtarget.isOSWindows() && DstVT == MVT::f32)
Opc = X86ISD::STRICT_FP80_ADD;

SDValue Add =
DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
// STRICT_FP_ROUND can't handle equal types.
if (DstVT == MVT::f80)
return Add;
return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
{Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
}
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
unsigned Opc = ISD::FADD;
// Windows needs the precision control changed to 80bits around this add.
if (Subtarget.isOSWindows() && DstVT == MVT::f32)
Opc = X86ISD::FP80_ADD;

SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
}
Expand Down Expand Up @@ -34881,6 +34891,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(AESDECWIDE256KL)
NODE_NAME_CASE(CMPCCXADD)
NODE_NAME_CASE(TESTUI)
NODE_NAME_CASE(FP80_ADD)
NODE_NAME_CASE(STRICT_FP80_ADD)
}
return nullptr;
#undef NODE_NAME_CASE
Expand Down Expand Up @@ -37356,6 +37368,69 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::CMOV_VK64:
return EmitLoweredSelect(MI, BB);

case X86::FP80_ADDr:
case X86::FP80_ADDm32: {
// Change the floating point control register to use double extended
// precision when performing the addition.
int OrigCWFrameIdx =
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
OrigCWFrameIdx);

// Load the old value of the control word...
Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
OrigCWFrameIdx);

// OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
// precision.
Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
.addReg(OldCW, RegState::Kill)
.addImm(0x300);

// Extract to 16 bits.
Register NewCW16 =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
.addReg(NewCW, RegState::Kill, X86::sub_16bit);

// Prepare memory for FLDCW.
int NewCWFrameIdx =
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
NewCWFrameIdx)
.addReg(NewCW16, RegState::Kill);

// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
NewCWFrameIdx);

// Do the addition.
if (MI.getOpcode() == X86::FP80_ADDr) {
BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
.add(MI.getOperand(2));
} else {
BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
.add(MI.getOperand(2))
.add(MI.getOperand(3))
.add(MI.getOperand(4))
.add(MI.getOperand(5))
.add(MI.getOperand(6));
}

// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
OrigCWFrameIdx);

MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}

case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
Expand Up @@ -740,6 +740,9 @@ namespace llvm {
// User level interrupts - testui
TESTUI,

// Perform an FP80 add after changing precision control in FPCW.
FP80_ADD,

/// X86 strict FP compare instructions.
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
STRICT_FCMPS,
Expand Down Expand Up @@ -779,6 +782,9 @@ namespace llvm {
STRICT_CVTPS2PH,
STRICT_CVTPH2PS,

// Perform an FP80 add after changing precision control in FPCW.
STRICT_FP80_ADD,

// WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
// non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.

Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/X86/X86InstrFPStack.td
Expand Up @@ -26,6 +26,13 @@ def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDTX86CwdLoad : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;

def X86fp80_add : SDNode<"X86ISD::FP80_ADD", SDTFPBinOp, [SDNPCommutative]>;
def X86strict_fp80_add : SDNode<"X86ISD::STRICT_FP80_ADD", SDTFPBinOp,
[SDNPHasChain,SDNPCommutative]>;
def any_X86fp80_add : PatFrags<(ops node:$lhs, node:$rhs),
[(X86strict_fp80_add node:$lhs, node:$rhs),
(X86fp80_add node:$lhs, node:$rhs)]>;

def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
Expand Down Expand Up @@ -141,6 +148,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in {
[(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
[(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;

def FP80_ADDr : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
[(set RFP80:$dst,
(any_X86fp80_add RFP80:$src1, RFP80:$src2))]>;
def FP80_ADDm32 : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2),
[(set RFP80:$dst,
(any_X86fp80_add RFP80:$src1,
(f80 (extloadf32 addr:$src2))))]>;
}

// All FP Stack operations are represented with four instructions here. The
Expand Down
48 changes: 46 additions & 2 deletions llvm/test/CodeGen/X86/uint64-to-float.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86-WIN
; RUN: llc < %s -mtriple=x86_64-windows -mattr=+sse2 | FileCheck %s --check-prefix=X64-WIN

; Verify that we are using the efficient uitofp --> sitofp lowering illustrated
; by the compiler_rt implementation of __floatundisf.
Expand Down Expand Up @@ -42,6 +44,48 @@ define float @test(i64 %a) nounwind {
; X64-NEXT: cvtsi2ss %rdi, %xmm0
; X64-NEXT: addss %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-WIN-LABEL: test:
; X86-WIN: # %bb.0: # %entry
; X86-WIN-NEXT: pushl %ebp
; X86-WIN-NEXT: movl %esp, %ebp
; X86-WIN-NEXT: andl $-8, %esp
; X86-WIN-NEXT: subl $24, %esp
; X86-WIN-NEXT: movl 12(%ebp), %eax
; X86-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-WIN-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
; X86-WIN-NEXT: shrl $31, %eax
; X86-WIN-NEXT: fildll {{[0-9]+}}(%esp)
; X86-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-WIN-NEXT: orl $768, %ecx # imm = 0x300
; X86-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp)
; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X86-WIN-NEXT: fadds __real@5f80000000000000(,%eax,4)
; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X86-WIN-NEXT: fstps {{[0-9]+}}(%esp)
; X86-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-WIN-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X86-WIN-NEXT: flds {{[0-9]+}}(%esp)
; X86-WIN-NEXT: movl %ebp, %esp
; X86-WIN-NEXT: popl %ebp
; X86-WIN-NEXT: retl
;
; X64-WIN-LABEL: test:
; X64-WIN: # %bb.0: # %entry
; X64-WIN-NEXT: testq %rcx, %rcx
; X64-WIN-NEXT: js .LBB0_1
; X64-WIN-NEXT: # %bb.2: # %entry
; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0
; X64-WIN-NEXT: retq
; X64-WIN-NEXT: .LBB0_1:
; X64-WIN-NEXT: movq %rcx, %rax
; X64-WIN-NEXT: shrq %rax
; X64-WIN-NEXT: andl $1, %ecx
; X64-WIN-NEXT: orq %rax, %rcx
; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0
; X64-WIN-NEXT: addss %xmm0, %xmm0
; X64-WIN-NEXT: retq
entry:
%b = uitofp i64 %a to float
ret float %b
Expand Down

0 comments on commit 11fb09e

Please sign in to comment.