Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 44 additions & 3 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -713,8 +713,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
Custom);
}

setOperationAction(ISD::BSWAP, MVT::i16, Expand);

setOperationAction(ISD::BR_JT, MVT::Other, Custom);
setOperationAction(ISD::BRIND, MVT::Other, Expand);

Expand Down Expand Up @@ -1106,6 +1104,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// * MVT::Other - internal.addrspace.wrap
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);

// Custom lowering for bswap
setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16},
Custom);
}

TargetLoweringBase::LegalizeTypeAction
Expand Down Expand Up @@ -2570,6 +2572,44 @@ static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) {
return Tcgen05StNode;
}

static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
EVT VT = Op.getValueType();

switch (VT.getSimpleVT().SimpleTy) {
case MVT::i16: {
SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
SDValue Swapped =
getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG);
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped);
}
case MVT::i32: {
return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
}
case MVT::v2i16: {
SDValue Converted = DAG.getBitcast(MVT::i32, Src);
SDValue Swapped =
getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG);
return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped);
}
case MVT::i64: {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Handle v2i32?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not need to have custom lowering for v2i32, as it is not a legal type. It will be split into two bswap of i32 during type legalization, so it will be handled by the i32 case above.

SDValue UnpackSrc =
DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src);
SDValue SwappedLow =
getPRMT(UnpackSrc.getValue(0), DAG.getConstant(0, DL, MVT::i32), 0x0123,
DL, DAG);
SDValue SwappedHigh =
getPRMT(UnpackSrc.getValue(1), DAG.getConstant(0, DL, MVT::i32), 0x0123,
DL, DAG);
return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64,
{SwappedHigh, SwappedLow});
}
default:
llvm_unreachable("unsupported type for bswap");
}
}

static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
switch (IID) {
case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
Expand Down Expand Up @@ -3193,7 +3233,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerCTLZCTPOP(Op, DAG);
case ISD::FREM:
return lowerFREM(Op, DAG);

case ISD::BSWAP:
return lowerBSWAP(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
}
Expand Down
32 changes: 0 additions & 32 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2468,38 +2468,6 @@ let Predicates = [hasPTX<73>, hasSM<52>] in {

include "NVPTXIntrinsics.td"

//-----------------------------------
// Notes
//-----------------------------------
// BSWAP is currently expanded. The following is a more efficient
// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register
// - for sm_20, use pmpt (use vector scalar mov to get the pack and
// unpack). sm_20 supports native 32-bit register, but not native 16-bit
// register.

def : Pat <
(i32 (bswap i32:$a)),
(PRMT_B32rii $a, (i32 0), (i32 0x0123), PrmtNONE)>;

def : Pat <
(v2i16 (bswap v2i16:$a)),
(PRMT_B32rii $a, (i32 0), (i32 0x2301), PrmtNONE)>;

def : Pat <
(i64 (bswap i64:$a)),
(V2I32toI64
(PRMT_B32rii (I64toI32H_Sink $a), (i32 0), (i32 0x0123), PrmtNONE),
(PRMT_B32rii (I64toI32L_Sink $a), (i32 0), (i32 0x0123), PrmtNONE))>,
Requires<[hasPTX<71>]>;

// Fall back to the old way if we don't have PTX 7.1.
def : Pat <
(i64 (bswap i64:$a)),
(V2I32toI64
(PRMT_B32rii (I64toI32H $a), (i32 0), (i32 0x0123), PrmtNONE),
(PRMT_B32rii (I64toI32L $a), (i32 0), (i32 0x0123), PrmtNONE))>;


////////////////////////////////////////////////////////////////////////////////
// PTX Fence instructions
////////////////////////////////////////////////////////////////////////////////
Expand Down
74 changes: 33 additions & 41 deletions llvm/test/CodeGen/NVPTX/bswap.ll
Original file line number Diff line number Diff line change
@@ -1,25 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %}
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s
; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}

target triple = "nvptx64-nvidia-cuda"

define i16 @bswap16(i16 %a) {
; CHECK-LABEL: bswap16(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0];
; CHECK-NEXT: shr.u16 %rs2, %rs1, 8;
; CHECK-NEXT: shl.b16 %rs3, %rs1, 8;
; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ld.param.b16 %r1, [bswap16_param_0];
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7701U;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%b = tail call i16 @llvm.bswap.i16(i16 %a)
ret i16 %b
Expand Down Expand Up @@ -56,40 +49,39 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
}

define i64 @bswap64(i64 %a) {
; PTX70-LABEL: bswap64(
; PTX70: {
; PTX70-NEXT: .reg .b32 %r<5>;
; PTX70-NEXT: .reg .b64 %rd<3>;
; PTX70-EMPTY:
; PTX70-NEXT: // %bb.0:
; PTX70-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2};
; PTX70-NEXT: st.param.b64 [func_retval0], %rd2;
; PTX70-NEXT: ret;
;
; PTX71-LABEL: bswap64(
; PTX71: {
; PTX71-NEXT: .reg .b32 %r<5>;
; PTX71-NEXT: .reg .b64 %rd<3>;
; PTX71-EMPTY:
; PTX71-NEXT: // %bb.0:
; PTX71-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
; PTX71-NEXT: mov.b64 {%r1, _}, %rd1;
; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
; PTX71-NEXT: mov.b64 {_, %r3}, %rd1;
; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2};
; PTX71-NEXT: st.param.b64 [func_retval0], %rd2;
; PTX71-NEXT: ret;
; CHECK-LABEL: bswap64(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x123U;
; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x123U;
; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
; CHECK-NEXT: ret;
%b = tail call i64 @llvm.bswap.i64(i64 %a)
ret i64 %b
}

define <2 x i32> @bswapv2i32(<2 x i32> %a) {
; CHECK-LABEL: bswapv2i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [bswapv2i32_param_0];
; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x123U;
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x123U;
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
; CHECK-NEXT: ret;
%b = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
ret <2 x i32> %b
}
declare i16 @llvm.bswap.i16(i16)
declare i32 @llvm.bswap.i32(i32)
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
declare i64 @llvm.bswap.i64(i64)
declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)