[NVPTX] Handle ldg created from sign-/zero-extended load

Reviewers: jingyue
Subscribers: jholewinski
Differential Revision: http://reviews.llvm.org/D18053
llvm-svn: 265389
llvm · Apr 5, 2016 · c799792 · c799792
1 parent 188de5a
commit c799792
Show file tree

Hide file tree

Showing 3 changed files with 148 additions and 4 deletions.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1286,7 +1286,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
   MemSDNode *Mem;
   bool IsLDG = true;
 
-  // If this is an LDG intrinsic, the address is the third operand. Its its an
+  // If this is an LDG intrinsic, the address is the third operand. If its an
   // LDG/LDU SD node (from custom vector handling), then its the second operand
   if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
     Op1 = N->getOperand(2);
@@ -1317,10 +1317,23 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
   SDValue Base, Offset, Addr;
 
   EVT EltVT = Mem->getMemoryVT();
+  unsigned NumElts = 1;
   if (EltVT.isVector()) {
+    NumElts = EltVT.getVectorNumElements();
     EltVT = EltVT.getVectorElementType();
   }
 
+  // Build the "promoted" result VTList for the load. If we are really loading
+  // i8s, then the return type will be promoted to i16 since we do not expose
+  // 8-bit registers in NVPTX.
+  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
+  SmallVector<EVT, 5> InstVTs;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    InstVTs.push_back(NodeVT);
+  }
+  InstVTs.push_back(MVT::Other);
+  SDVTList InstVTList = CurDAG->getVTList(InstVTs);
+
   if (SelectDirectAddr(Op1, Addr)) {
     switch (N->getOpcode()) {
     default:
@@ -1461,7 +1474,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
     }
 
     SDValue Ops[] = { Addr, Chain };
-    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+    LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
     if (TM.is64Bit()) {
@@ -1750,7 +1763,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
 
     SDValue Ops[] = { Base, Offset, Chain };
 
-    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+    LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
   } else {
     if (TM.is64Bit()) {
       switch (N->getOpcode()) {
@@ -2037,13 +2050,77 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
     }
 
     SDValue Ops[] = { Op1, Chain };
-    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+    LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
   }
 
   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
   MemRefs0[0] = Mem->getMemOperand();
   cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
 
+  // For automatic generation of LDG (through SelectLoad[Vector], not the
+  // intrinsics), we may have an extending load like:
+  //
+  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
+  //
+  // Since we load an i8 value, the matching logic above will have selected an
+  // LDG instruction that reads i8 and stores it in an i16 register (NVPTX does
+  // not expose 8-bit registers):
+  //
+  //   i16,ch = INT_PTX_LDG_GLOBAL_i8areg64 t7, t0
+  //
+  // To get the correct type in this case, truncate back to i8 and then extend
+  // to the original load type.
+  EVT OrigType = N->getValueType(0);
+  LoadSDNode *LDSD = dyn_cast<LoadSDNode>(N);
+  if (LDSD && EltVT == MVT::i8 && OrigType.getScalarSizeInBits() >= 32) {
+    unsigned CvtOpc = 0;
+
+    switch (LDSD->getExtensionType()) {
+    default:
+      llvm_unreachable("An extension is required for i8 loads");
+      break;
+    case ISD::SEXTLOAD:
+      switch (OrigType.getSimpleVT().SimpleTy) {
+      default:
+        llvm_unreachable("Unhandled integer load type");
+        break;
+      case MVT::i32:
+        CvtOpc = NVPTX::CVT_s32_s8;
+        break;
+      case MVT::i64:
+        CvtOpc = NVPTX::CVT_s64_s8;
+        break;
+      }
+      break;
+    case ISD::EXTLOAD:
+    case ISD::ZEXTLOAD:
+      switch (OrigType.getSimpleVT().SimpleTy) {
+      default:
+        llvm_unreachable("Unhandled integer load type");
+        break;
+      case MVT::i32:
+        CvtOpc = NVPTX::CVT_u32_u8;
+        break;
+      case MVT::i64:
+        CvtOpc = NVPTX::CVT_u64_u8;
+        break;
+      }
+      break;
+    }
+
+    // For each output value, truncate to i8 (since the upper 8 bits are
+    // undefined) and then extend to the desired type.
+    for (unsigned i = 0; i != NumElts; ++i) {
+      SDValue Res(LD, i);
+      SDValue OrigVal(N, i);
+
+      SDNode *CvtNode =
+        CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
+                               CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32));
+      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
+    }
+  }
+
   return LD;
 }
 

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -319,6 +319,16 @@ let hasSideEffects = 0 in {
   // takes a CvtMode immediate that defines the conversion mode to use.  It can
   // be CvtNONE to omit a conversion mode.
   multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+    def _s8 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".s8\t$dst, $src;"), []>;
+    def _u8 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".u8\t$dst, $src;"), []>;
     def _s16 :
       NVPTXInst<(outs RC:$dst),
                 (ins Int16Regs:$src, CvtMode:$mode),

diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+
+; Verify that we correctly emit code for i8 ldg/ldu. We do not expose 8-bit
+; registers in the backend, so these loads need special handling.
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; CHECK-LABEL: ex_zext
+define void @ex_zext(i8* noalias readonly %data, i32* %res) {
+entry:
+; CHECK: ld.global.nc.u8
+  %val = load i8, i8* %data
+; CHECK: cvt.u32.u8
+  %valext = zext i8 %val to i32
+  store i32 %valext, i32* %res
+  ret void
+}
+
+; CHECK-LABEL: ex_sext
+define void @ex_sext(i8* noalias readonly %data, i32* %res) {
+entry:
+; CHECK: ld.global.nc.u8
+  %val = load i8, i8* %data
+; CHECK: cvt.s32.s8
+  %valext = sext i8 %val to i32
+  store i32 %valext, i32* %res
+  ret void
+}
+
+; CHECK-LABEL: ex_zext_v2
+define void @ex_zext_v2(<2 x i8>* noalias readonly %data, <2 x i32>* %res) {
+entry:
+; CHECK: ld.global.nc.v2.u8
+  %val = load <2 x i8>, <2 x i8>* %data
+; CHECK: cvt.u32.u16
+  %valext = zext <2 x i8> %val to <2 x i32>
+  store <2 x i32> %valext, <2 x i32>* %res
+  ret void
+}
+
+; CHECK-LABEL: ex_sext_v2
+define void @ex_sext_v2(<2 x i8>* noalias readonly %data, <2 x i32>* %res) {
+entry:
+; CHECK: ld.global.nc.v2.u8
+  %val = load <2 x i8>, <2 x i8>* %data
+; CHECK: cvt.s32.s8
+  %valext = sext <2 x i8> %val to <2 x i32>
+  store <2 x i32> %valext, <2 x i32>* %res
+  ret void
+}
+
+!nvvm.annotations = !{!0,!1,!2,!3}
+!0 = !{void (i8*, i32*)* @ex_zext, !"kernel", i32 1}
+!1 = !{void (i8*, i32*)* @ex_sext, !"kernel", i32 1}
+!2 = !{void (<2 x i8>*, <2 x i32>*)* @ex_zext_v2, !"kernel", i32 1}
+!3 = !{void (<2 x i8>*, <2 x i32>*)* @ex_sext_v2, !"kernel", i32 1}