Skip to content

Commit

Permalink
[NVPTX] Handle ldg created from sign-/zero-extended load
Browse files Browse the repository at this point in the history
Reviewers: jingyue

Subscribers: jholewinski

Differential Revision: http://reviews.llvm.org/D18053

llvm-svn: 265389
  • Loading branch information
jholewinski committed Apr 5, 2016
1 parent 188de5a commit c799792
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 4 deletions.
85 changes: 81 additions & 4 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Expand Up @@ -1286,7 +1286,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
MemSDNode *Mem;
bool IsLDG = true;

// If this is an LDG intrinsic, the address is the third operand. Its its an
// If this is an LDG intrinsic, the address is the third operand. If it's an
// LDG/LDU SD node (from custom vector handling), then it's the second operand
if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
Op1 = N->getOperand(2);
Expand Down Expand Up @@ -1317,10 +1317,23 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
SDValue Base, Offset, Addr;

EVT EltVT = Mem->getMemoryVT();
unsigned NumElts = 1;
if (EltVT.isVector()) {
NumElts = EltVT.getVectorNumElements();
EltVT = EltVT.getVectorElementType();
}

// Build the "promoted" result VTList for the load. If we are really loading
// i8s, then the return type will be promoted to i16 since we do not expose
// 8-bit registers in NVPTX.
EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
SmallVector<EVT, 5> InstVTs;
for (unsigned i = 0; i != NumElts; ++i) {
InstVTs.push_back(NodeVT);
}
InstVTs.push_back(MVT::Other);
SDVTList InstVTList = CurDAG->getVTList(InstVTs);

if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
default:
Expand Down Expand Up @@ -1461,7 +1474,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
}

SDValue Ops[] = { Addr, Chain };
LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
} else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
if (TM.is64Bit()) {
Expand Down Expand Up @@ -1750,7 +1763,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {

SDValue Ops[] = { Base, Offset, Chain };

LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
} else {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
Expand Down Expand Up @@ -2037,13 +2050,77 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
}

SDValue Ops[] = { Op1, Chain };
LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
}

MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
MemRefs0[0] = Mem->getMemOperand();
cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);

// For automatic generation of LDG (through SelectLoad[Vector], not the
// intrinsics), we may have an extending load like:
//
// i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
//
// Since we load an i8 value, the matching logic above will have selected an
// LDG instruction that reads i8 and stores it in an i16 register (NVPTX does
// not expose 8-bit registers):
//
// i16,ch = INT_PTX_LDG_GLOBAL_i8areg64 t7, t0
//
// To get the correct type in this case, truncate back to i8 and then extend
// to the original load type.
EVT OrigType = N->getValueType(0);
LoadSDNode *LDSD = dyn_cast<LoadSDNode>(N);
if (LDSD && EltVT == MVT::i8 && OrigType.getScalarSizeInBits() >= 32) {
unsigned CvtOpc = 0;

switch (LDSD->getExtensionType()) {
default:
llvm_unreachable("An extension is required for i8 loads");
break;
case ISD::SEXTLOAD:
switch (OrigType.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("Unhandled integer load type");
break;
case MVT::i32:
CvtOpc = NVPTX::CVT_s32_s8;
break;
case MVT::i64:
CvtOpc = NVPTX::CVT_s64_s8;
break;
}
break;
case ISD::EXTLOAD:
case ISD::ZEXTLOAD:
switch (OrigType.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("Unhandled integer load type");
break;
case MVT::i32:
CvtOpc = NVPTX::CVT_u32_u8;
break;
case MVT::i64:
CvtOpc = NVPTX::CVT_u64_u8;
break;
}
break;
}

// For each output value, truncate to i8 (since the upper 8 bits are
// undefined) and then extend to the desired type.
for (unsigned i = 0; i != NumElts; ++i) {
SDValue Res(LD, i);
SDValue OrigVal(N, i);

SDNode *CvtNode =
CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32));
ReplaceUses(OrigVal, SDValue(CvtNode, 0));
}
}

return LD;
}

Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Expand Up @@ -319,6 +319,16 @@ let hasSideEffects = 0 in {
// takes a CvtMode immediate that defines the conversion mode to use. It can
// be CvtNONE to omit a conversion mode.
multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
// Conversion variant whose source operand is printed with the ".s8" type
// suffix. The source is still taken from a 16-bit register (Int16Regs), the
// same register class the ".s16" variant of this multiclass uses.
// The emitted text is "cvt" + the $mode modifiers + "." + FromName + ".s8",
// followed by "$dst, $src;".
// NOTE(review): FromName is printed directly after "cvt<mode>.", ahead of the
// source suffix; presumably that is the destination-type slot of PTX cvt, in
// which case the parameter name is misleading -- confirm against the PTX ISA.
def _s8 :
NVPTXInst<(outs RC:$dst),
(ins Int16Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
FromName, ".s8\t$dst, $src;"), []>;
// Conversion variant whose source operand is printed with the ".u8" type
// suffix. As with the ".s8" variant, the source operand is an Int16Regs
// register and the only other input is the CvtMode immediate.
// The emitted text is "cvt" + the $mode modifiers + "." + FromName + ".u8",
// followed by "$dst, $src;".
def _u8 :
NVPTXInst<(outs RC:$dst),
(ins Int16Regs:$src, CvtMode:$mode),
!strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
FromName, ".u8\t$dst, $src;"), []>;
def _s16 :
NVPTXInst<(outs RC:$dst),
(ins Int16Regs:$src, CvtMode:$mode),
Expand Down
57 changes: 57 additions & 0 deletions llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -0,0 +1,57 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s

; Verify that we correctly emit code for i8 ldg/ldu. We do not expose 8-bit
; registers in the backend, so these loads need special handling.

target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-unknown-unknown"

; Scalar zero-extension case: loads one i8 through the noalias readonly
; pointer %data, zero-extends it to i32 and stores the result to %res.
; The expected output is an ld.global.nc.u8 load followed by a cvt.u32.u8
; conversion, in that order.
; CHECK-LABEL: ex_zext
define void @ex_zext(i8* noalias readonly %data, i32* %res) {
entry:
; CHECK: ld.global.nc.u8
%val = load i8, i8* %data
; CHECK: cvt.u32.u8
%valext = zext i8 %val to i32
store i32 %valext, i32* %res
ret void
}

; Scalar sign-extension case: loads one i8 through the noalias readonly
; pointer %data, sign-extends it to i32 and stores the result to %res.
; The load is still expected to be printed as ld.global.nc.u8; the signedness
; only shows up in the following cvt.s32.s8 conversion.
; CHECK-LABEL: ex_sext
define void @ex_sext(i8* noalias readonly %data, i32* %res) {
entry:
; CHECK: ld.global.nc.u8
%val = load i8, i8* %data
; CHECK: cvt.s32.s8
%valext = sext i8 %val to i32
store i32 %valext, i32* %res
ret void
}

; Vector zero-extension case: loads a <2 x i8> through the noalias readonly
; pointer %data, zero-extends it to <2 x i32> and stores the result to %res.
; The expected load is the two-element form ld.global.nc.v2.u8. Note that the
; expected conversion here is cvt.u32.u16 (16-bit source), unlike the scalar
; zext test, which expects cvt.u32.u8.
; CHECK-LABEL: ex_zext_v2
define void @ex_zext_v2(<2 x i8>* noalias readonly %data, <2 x i32>* %res) {
entry:
; CHECK: ld.global.nc.v2.u8
%val = load <2 x i8>, <2 x i8>* %data
; CHECK: cvt.u32.u16
%valext = zext <2 x i8> %val to <2 x i32>
store <2 x i32> %valext, <2 x i32>* %res
ret void
}

; Vector sign-extension case: loads a <2 x i8> through the noalias readonly
; pointer %data, sign-extends it to <2 x i32> and stores the result to %res.
; The expected load is the two-element form ld.global.nc.v2.u8, followed by a
; cvt.s32.s8 conversion (8-bit signed source, as in the scalar sext test).
; CHECK-LABEL: ex_sext_v2
define void @ex_sext_v2(<2 x i8>* noalias readonly %data, <2 x i32>* %res) {
entry:
; CHECK: ld.global.nc.v2.u8
%val = load <2 x i8>, <2 x i8>* %data
; CHECK: cvt.s32.s8
%valext = sext <2 x i8> %val to <2 x i32>
store <2 x i32> %valext, <2 x i32>* %res
ret void
}

!nvvm.annotations = !{!0,!1,!2,!3}
!0 = !{void (i8*, i32*)* @ex_zext, !"kernel", i32 1}
!1 = !{void (i8*, i32*)* @ex_sext, !"kernel", i32 1}
!2 = !{void (<2 x i8>*, <2 x i32>*)* @ex_zext_v2, !"kernel", i32 1}
!3 = !{void (<2 x i8>*, <2 x i32>*)* @ex_sext_v2, !"kernel", i32 1}

0 comments on commit c799792

Please sign in to comment.