diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index a77eb0240e677..454a237b1be78 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -713,8 +713,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, Custom); } - setOperationAction(ISD::BSWAP, MVT::i16, Expand); - setOperationAction(ISD::BR_JT, MVT::Other, Custom); setOperationAction(ISD::BRIND, MVT::Other, Expand); @@ -1106,6 +1104,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // * MVT::Other - internal.addrspace.wrap setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom); + + // Custom lowering for bswap + setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16}, + Custom); } TargetLoweringBase::LegalizeTypeAction @@ -2570,6 +2572,44 @@ static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) { return Tcgen05StNode; } +static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + EVT VT = Op.getValueType(); + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i16: { + SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src); + SDValue Swapped = + getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped); + } + case MVT::i32: { + return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG); + } + case MVT::v2i16: { + SDValue Converted = DAG.getBitcast(MVT::i32, Src); + SDValue Swapped = + getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG); + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped); + } + case MVT::i64: { + SDValue UnpackSrc = + DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src); + SDValue SwappedLow = + getPRMT(UnpackSrc.getValue(0), DAG.getConstant(0, DL, MVT::i32), 0x0123, + DL, DAG); + SDValue SwappedHigh = + getPRMT(UnpackSrc.getValue(1), DAG.getConstant(0, DL, MVT::i32), 0x0123, + DL, DAG); + return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, + {SwappedHigh, SwappedLow}); + } + default: + llvm_unreachable("unsupported type for bswap"); + } +} + static unsigned getTcgen05MMADisableOutputLane(unsigned IID) { switch (IID) { case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1: @@ -3193,7 +3233,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerCTLZCTPOP(Op, DAG); case ISD::FREM: return lowerFREM(Op, DAG); - + case ISD::BSWAP: + return lowerBSWAP(Op, DAG); default: llvm_unreachable("Custom lowering not defined for operation"); } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 8b129e7e5eeae..04e2dd435cdf0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2468,38 +2468,6 @@ let Predicates = [hasPTX<73>, hasSM<52>] in { include "NVPTXIntrinsics.td" -//----------------------------------- -// Notes -//----------------------------------- -// BSWAP is currently expanded. The following is a more efficient -// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register -// - for sm_20, use pmpt (use vector scalar mov to get the pack and -// unpack). sm_20 supports native 32-bit register, but not native 16-bit -// register. - -def : Pat < - (i32 (bswap i32:$a)), - (PRMT_B32rii $a, (i32 0), (i32 0x0123), PrmtNONE)>; - -def : Pat < - (v2i16 (bswap v2i16:$a)), - (PRMT_B32rii $a, (i32 0), (i32 0x2301), PrmtNONE)>; - -def : Pat < - (i64 (bswap i64:$a)), - (V2I32toI64 - (PRMT_B32rii (I64toI32H_Sink $a), (i32 0), (i32 0x0123), PrmtNONE), - (PRMT_B32rii (I64toI32L_Sink $a), (i32 0), (i32 0x0123), PrmtNONE))>, - Requires<[hasPTX<71>]>; - -// Fall back to the old way if we don't have PTX 7.1. -def : Pat < - (i64 (bswap i64:$a)), - (V2I32toI64 - (PRMT_B32rii (I64toI32H $a), (i32 0), (i32 0x0123), PrmtNONE), - (PRMT_B32rii (I64toI32L $a), (i32 0), (i32 0x0123), PrmtNONE))>; - - //////////////////////////////////////////////////////////////////////////////// // PTX Fence instructions //////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll index e3d1c80922609..8050c6f1c7031 100644 --- a/llvm/test/CodeGen/NVPTX/bswap.ll +++ b/llvm/test/CodeGen/NVPTX/bswap.ll @@ -1,25 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; RUN: %if ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %} -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s -; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" define i16 @bswap16(i16 %a) { ; CHECK-LABEL: bswap16( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0]; -; CHECK-NEXT: shr.u16 %rs2, %rs1, 8; -; CHECK-NEXT: shl.b16 %rs3, %rs1, 8; -; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ld.param.b16 %r1, [bswap16_param_0]; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7701U; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %b = tail call i16 @llvm.bswap.i16(i16 %a) ret i16 %b @@ -56,40 +49,39 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 { } define i64 @bswap64(i64 %a) { -; PTX70-LABEL: bswap64( -; PTX70: { -; PTX70-NEXT: .reg .b32 %r<5>; -; PTX70-NEXT: .reg .b64 %rd<3>; -; PTX70-EMPTY: -; PTX70-NEXT: // %bb.0: -; PTX70-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; -; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } -; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 0x123U; -; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; } -; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 0x123U; -; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2}; -; PTX70-NEXT: st.param.b64 [func_retval0], %rd2; -; PTX70-NEXT: ret; -; -; PTX71-LABEL: bswap64( -; PTX71: { -; PTX71-NEXT: .reg .b32 %r<5>; -; PTX71-NEXT: .reg .b64 %rd<3>; -; PTX71-EMPTY: -; PTX71-NEXT: // %bb.0: -; PTX71-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; -; PTX71-NEXT: mov.b64 {%r1, _}, %rd1; -; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 0x123U; -; PTX71-NEXT: mov.b64 {_, %r3}, %rd1; -; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 0x123U; -; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2}; -; PTX71-NEXT: st.param.b64 [func_retval0], %rd2; -; PTX71-NEXT: ret; +; CHECK-LABEL: bswap64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x123U; +; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x123U; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; %b = tail call i64 @llvm.bswap.i64(i64 %a) ret i64 %b } +define <2 x i32> @bswapv2i32(<2 x i32> %a) { +; CHECK-LABEL: bswapv2i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [bswapv2i32_param_0]; +; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x123U; +; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x123U; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %b = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a) + ret <2 x i32> %b +} declare i16 @llvm.bswap.i16(i16) declare i32 @llvm.bswap.i32(i32) declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) declare i64 @llvm.bswap.i64(i64) +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)