[Sparc] efficient pattern for UINT_TO_FP conversion

Summary: while investigating performance degradation of imagick benchmark there were found inefficient pattern for UINT_TO_FP conversion. That pattern causes RAW hazard in assembly code. Specifically, uitofp IR operator results in poor assembler : st %i0, [%fp - 952] ldd [%fp - 952], %f0 it stores 32-bit integer register into memory location and then loads 64-bit floating point data from that location. That is exactly RAW hazard case. To optimize that case it is possible to use SPISD::ITOF and SPISD::XTOF for conversion from integer to floating point data type and to use ISD::BITCAST to copy from integer register into floating point register. The fix is to write custom UINT_TO_FP pattern using SPISD::ITOF, SPISD::XTOF, ISD::BITCAST. Patch by Alexey Lapshin Reviewers: fedor.sergeev, jyknight, dcederman, lero_chris Reviewed By: jyknight Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D36875 llvm-svn: 318704
llvm · Nov 20, 2017 · a476117 · a476117
1 parent 2bc260a
commit a476117
Show file tree

Hide file tree

Showing 4 changed files with 144 additions and 30 deletions.
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1559,9 +1559,6 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 
-  setOperationAction(ISD::BITCAST, MVT::f32, Expand);
-  setOperationAction(ISD::BITCAST, MVT::i32, Expand);
-
   // Sparc has no select or setcc: expand to SELECT_CC.
   setOperationAction(ISD::SELECT, MVT::i32, Expand);
   setOperationAction(ISD::SELECT, MVT::f32, Expand);
@@ -1590,13 +1587,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 
+  setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+  setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+
   if (Subtarget->is64Bit()) {
     setOperationAction(ISD::ADDC, MVT::i64, Custom);
     setOperationAction(ISD::ADDE, MVT::i64, Custom);
     setOperationAction(ISD::SUBC, MVT::i64, Custom);
     setOperationAction(ISD::SUBE, MVT::i64, Custom);
-    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
-    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
     setOperationAction(ISD::SELECT, MVT::i64, Expand);
     setOperationAction(ISD::SETCC, MVT::i64, Expand);
     setOperationAction(ISD::BR_CC, MVT::i64, Custom);
@@ -1610,6 +1608,9 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::ROTL , MVT::i64, Expand);
     setOperationAction(ISD::ROTR , MVT::i64, Expand);
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+
+    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f64, Custom);
   }
 
   // ATOMICs.
@@ -2425,23 +2426,76 @@ static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG,
                          1);
 }
 
-static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG,
-                               const SparcTargetLowering &TLI,
-                               bool hasHardQuad) {
+SDValue SparcTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT SrcVT = Op.getOperand(0).getValueType();
+
+  EVT DstVT = Op.getValueType();
+
+  if (Subtarget->isVIS3()) {
+    if (DstVT == MVT::f32 && SrcVT == MVT::i32) {
+      return Op; // Legal
+    } else if (DstVT == MVT::f64 && SrcVT == MVT::i64) {
+      return (Subtarget->is64Bit())
+                 ? Op
+                 : SDValue(); // Legal on 64 bit, otherwise Expand
+    } else if (DstVT == MVT::i64 && SrcVT == MVT::f64) {
+      return (Subtarget->is64Bit())
+                 ? Op
+                 : SDValue(); // Legal on 64 bit, otherwise Expand
+    }
+  }
+
+  // Expand
+  return SDValue();
+}
+
+SDValue SparcTargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                             SelectionDAG &DAG) const {
   SDLoc dl(Op);
   EVT OpVT = Op.getOperand(0).getValueType();
   assert(OpVT == MVT::i32 || OpVT == MVT::i64);
 
-  // Expand if it does not involve f128 or the target has support for
-  // quad floating point instructions and the operand type is legal.
-  if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT)))
-    return SDValue();
+  // Expand f128 operations to fp128 ABI calls.
+  if (Op.getValueType() == MVT::f128 &&
+      (!Subtarget->hasHardQuad() || !isTypeLegal(OpVT))) {
+    return LowerF128Op(Op, DAG,
+                       getLibcallName(OpVT == MVT::i32
+                                          ? RTLIB::UINTTOFP_I32_F128
+                                          : RTLIB::UINTTOFP_I64_F128),
+                       1);
+  }
+
+  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+  // the optimization here.
+  if (DAG.SignBitIsZero(Op.getOperand(0))) {
+
+    EVT floatVT = MVT::f32;
+    unsigned IntToFloatOpcode = SPISD::ITOF;
+
+    if (OpVT == MVT::i64) {
+      floatVT = MVT::f64;
+      IntToFloatOpcode = SPISD::XTOF;
+    }
 
-  return TLI.LowerF128Op(Op, DAG,
-                         TLI.getLibcallName(OpVT == MVT::i32
-                                            ? RTLIB::UINTTOFP_I32_F128
-                                            : RTLIB::UINTTOFP_I64_F128),
-                         1);
+    // Convert the int value to FP in an FP register.
+    SDValue FloatTmp = DAG.getNode(ISD::BITCAST, dl, floatVT, Op.getOperand(0));
+
+    return DAG.getNode(IntToFloatOpcode, dl, Op.getValueType(), FloatTmp);
+  }
+
+  if (OpVT == MVT::i32 && Subtarget->is64Bit()) {
+
+    SDValue Int64Tmp =
+        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op.getOperand(0));
+
+    SDValue Float64Tmp = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Int64Tmp);
+
+    return DAG.getNode(SPISD::XTOF, dl, Op.getValueType(), Float64Tmp);
+  }
+
+  return SDValue();
 }
 
 static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
@@ -3059,8 +3113,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
                                                        hasHardQuad);
   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG, *this,
                                                        hasHardQuad);
-  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG, *this,
-                                                       hasHardQuad);
+  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   case ISD::BR_CC:              return LowerBR_CC(Op, DAG, *this,
                                                   hasHardQuad);
   case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG, *this,
@@ -3097,6 +3150,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:       return LowerATOMIC_LOAD_STORE(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
   }
 }
 

diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -192,6 +192,10 @@ namespace llvm {
 
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
+    SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+
     bool ShouldShrinkFPConstant(EVT VT) const override {
       // Do not shrink FP constpool if VT == MVT::f128.
       // (ldd, call _Q_fdtoq) is more expensive than two ldds.

diff --git a/llvm/lib/Target/Sparc/SparcInstrVIS.td b/llvm/lib/Target/Sparc/SparcInstrVIS.td
@@ -243,16 +243,21 @@ def LZCNT     : VISInstFormat<0b000010111, (outs I64Regs:$rd),
                    (ins I64Regs:$rs2), "lzcnt $rs2, $rd", []>;
 
 let rs1 = 0 in {
-def MOVSTOSW : VISInstFormat<0b100010011, (outs I64Regs:$rd),
-                   (ins DFPRegs:$rs2), "movstosw $rs2, $rd", []>;
-def MOVSTOUW : VISInstFormat<0b100010001, (outs I64Regs:$rd),
-                   (ins DFPRegs:$rs2), "movstouw $rs2, $rd", []>;
-def MOVDTOX  : VISInstFormat<0b100010000, (outs I64Regs:$rd),
-                   (ins DFPRegs:$rs2), "movdtox $rs2, $rd", []>;
-def MOVWTOS  :  VISInstFormat<0b100011001, (outs DFPRegs:$rd),
-                   (ins I64Regs:$rs2), "movdtox $rs2, $rd", []>;
-def MOVXTOD  :  VISInstFormat<0b100011000, (outs DFPRegs:$rd),
-                   (ins I64Regs:$rs2), "movdtox $rs2, $rd", []>;
+def MOVSTOSW : VISInstFormat<0b100010011, (outs I64Regs:$rd), (ins FPRegs:$rs2), 
+                   "movstosw $rs2, $rd", 
+                   [(set I64Regs:$rd, (sext (i32 (bitconvert FPRegs:$rs2))))]>;
+def MOVSTOUW : VISInstFormat<0b100010001, (outs I64Regs:$rd), (ins FPRegs:$rs2), 
+                   "movstouw $rs2, $rd", 
+                   [(set I64Regs:$rd, (zext (i32 (bitconvert FPRegs:$rs2))))]>;
+def MOVDTOX  : VISInstFormat<0b100010000, (outs I64Regs:$rd), (ins DFPRegs:$rs2), 
+                   "movdtox $rs2, $rd", 
+                   [(set I64Regs:$rd, (bitconvert DFPRegs:$rs2))]>;
+def MOVWTOS  :  VISInstFormat<0b100011001, (outs FPRegs:$rd), (ins IntRegs:$rs2), 
+                   "movwtos $rs2, $rd", 
+                   [(set FPRegs:$rd, (bitconvert i32:$rs2))]>;
+def MOVXTOD  :  VISInstFormat<0b100011000, (outs DFPRegs:$rd), (ins I64Regs:$rs2),
+                   "movxtod $rs2, $rd", 
+                   [(set DFPRegs:$rd, (bitconvert I64Regs:$rs2))]>;
 }
 
 def PDISTN   : VISInst<0b000111111, "pdistn">;

diff --git a/llvm/test/CodeGen/SPARC/float.ll b/llvm/test/CodeGen/SPARC/float.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -march=sparc -O0 < %s | FileCheck %s -check-prefix=V8-UNOPT
 ; RUN: llc -march=sparc -mattr=v9 < %s | FileCheck %s -check-prefix=V9
 ; RUN: llc -mtriple=sparc64-unknown-linux < %s | FileCheck %s -check-prefix=SPARC64
+; RUN: llc -march=sparc -mcpu=niagara4 < %s  | FileCheck %s -check-prefix=VIS3
+; RUN: llc -march=sparcv9 -mcpu=niagara4 < %s | FileCheck %s -check-prefix=VIS3-64
 
 ; V8-LABEL:     test_neg:
 ; V8:     call get_double
@@ -194,7 +196,7 @@ entry:
 ; V9:          fstoi
 
 ; SPARC64-LABEL:    test_utos_stou
-; SPARC64:     fdtos
+; SPARC64:     fxtos
 ; SPARC64:     fstoi
 
 define void @test_utos_stou(i32 %a, i32* %ptr0, float* %ptr1) {
@@ -240,6 +242,9 @@ entry:
 ; SPARC64-NOT:      fitod
 ; SPARC64:          fdtoi
 
+; VIS3-64-LABEL:  test_utod_dtou
+; VIS3-64:        movxtod 
+
 define void @test_utod_dtou(i32 %a, double %b, i32* %ptr0, double* %ptr1) {
 entry:
   %0 = uitofp i32 %a to double
@@ -248,3 +253,49 @@ entry:
   store i32 %1, i32* %ptr0, align 8
   ret void
 }
+
+; V8-LABEL:    test_ustod
+; V8:          fitod
+
+; VIS3-LABEL:  test_ustod
+; VIS3:        movwtos 
+
+define double @test_ustod(i16 zeroext) {
+  %2 = uitofp i16 %0 to double
+  ret double %2
+}
+
+; V8-LABEL:    test_ustos
+; V8:          fitos
+
+; VIS3-LABEL:  test_ustos
+; VIS3:        movwtos
+
+define float @test_ustos(i16 zeroext) {
+  %2 = uitofp i16 %0 to float 
+  ret float %2
+}
+
+; check for movwtos used for bitcast 
+;
+; VIS3-LABEL:  test_bitcast_utos 
+; VIS3:movwtos  
+
+define float @test_bitcast_utos(i32 ) {
+  %2 = bitcast i32 %0 to float 
+  ret float %2
+}
+
+
+; check for movxtod used for bitcast 
+;
+; VIS3-64-LABEL:  test_bitcast_uxtod 
+; VIS3-64:movxtod  
+
+define double @test_bitcast_uxtod(i64 ) {
+  %2 = bitcast i64 %0 to double 
+  ret double %2
+}
+
+
+