From c396152627656c1d53fcf115a0d64e3b1c5da87f Mon Sep 17 00:00:00 2001 From: chengjunp Date: Thu, 20 Nov 2025 23:21:59 +0000 Subject: [PATCH 1/5] Use PRMT instruction to lower i16 bswap --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 -- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 4 ++++ llvm/test/CodeGen/NVPTX/bswap.ll | 14 +++++++------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2f1a7ad2d401f..9de643497ecb4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -711,8 +711,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, Custom); } - setOperationAction(ISD::BSWAP, MVT::i16, Expand); - setOperationAction(ISD::BR_JT, MVT::Other, Custom); setOperationAction(ISD::BRIND, MVT::Other, Expand); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index dfde0cca0f00c..b69aa359cb725 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2455,6 +2455,10 @@ include "NVPTXIntrinsics.td" // unpack). sm_20 supports native 32-bit register, but not native 16-bit // register. +def : Pat < + (i16 (bswap i16:$a)), + (i16 (CVT_u16_u32 (PRMT_B32rii (i32 (CVT_u32_u16 $a, CvtNONE)), (i32 0), (i32 0x0001), PrmtNONE), CvtNONE))>; + def : Pat < (i32 (bswap i32:$a)), (PRMT_B32rii $a, (i32 0), (i32 0x0123), PrmtNONE)>; diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll index e3d1c80922609..a12deed544642 100644 --- a/llvm/test/CodeGen/NVPTX/bswap.ll +++ b/llvm/test/CodeGen/NVPTX/bswap.ll @@ -10,16 +10,16 @@ target triple = "nvptx64-nvidia-cuda" define i16 @bswap16(i16 %a) { ; CHECK-LABEL: bswap16( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0]; -; CHECK-NEXT: shr.u16 %rs2, %rs1, 8; -; CHECK-NEXT: shl.b16 %rs3, %rs1, 8; -; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x1U; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r2; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %b = tail call i16 @llvm.bswap.i16(i16 %a) ret i16 %b From 2de19db36471c140404a06234426d101af9ffaed Mon Sep 17 00:00:00 2001 From: chengjunp Date: Fri, 21 Nov 2025 00:30:53 +0000 Subject: [PATCH 2/5] Format --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 2654d3fbef6f6..2bcf4120c7ad2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2477,9 +2477,10 @@ include "NVPTXIntrinsics.td" // unpack). sm_20 supports native 32-bit register, but not native 16-bit // register. -def : Pat < - (i16 (bswap i16:$a)), - (i16 (CVT_u16_u32 (PRMT_B32rii (i32 (CVT_u32_u16 $a, CvtNONE)), (i32 0), (i32 0x0001), PrmtNONE), CvtNONE))>; +def : Pat<(i16 (bswap i16:$a)), + (i16 (CVT_u16_u32 (PRMT_B32rii (i32 (CVT_u32_u16 $a, CvtNONE)), + (i32 0), (i32 0x0001), PrmtNONE), + CvtNONE))>; def : Pat < (i32 (bswap i32:$a)), From adec39963b75953e24cc6a4317fe51356198291a Mon Sep 17 00:00:00 2001 From: chengjunp Date: Fri, 21 Nov 2025 22:44:22 +0000 Subject: [PATCH 3/5] Lowering bswap in operation legalization --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 48 ++++++++++++++++++++- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 35 ++------------- llvm/test/CodeGen/NVPTX/bswap.ll | 16 +++---- 3 files changed, 56 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index cc675ff6ff7c7..67d5f99f958d8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1104,6 +1104,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // * MVT::Other - internal.addrspace.wrap setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom); + + // Custom lowering for bswap + setOperationAction(ISD::BSWAP, MVT::i16, Custom); + setOperationAction(ISD::BSWAP, MVT::i32, Custom); + setOperationAction(ISD::BSWAP, MVT::i64, Custom); + setOperationAction(ISD::BSWAP, MVT::v2i16, Custom); } TargetLoweringBase::LegalizeTypeAction @@ -2568,6 +2574,45 @@ static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) { return Tcgen05StNode; } +static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + EVT VT = Op.getValueType(); + + if (VT == MVT::i16) { + SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); + SDValue Swapped = + getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped); + } + + if (VT == MVT::i32) { + return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG); + } + + if (VT == MVT::v2i16) { + SDValue Converted = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); + SDValue Swapped = + getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG); + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped); + } + + if (VT == MVT::i64) { + SDValue Low = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Src, + DAG.getIntPtrConstant(0, DL)); + SDValue High = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Src, + DAG.getIntPtrConstant(1, DL)); + SDValue SwappedLow = + getPRMT(Low, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG); + SDValue SwappedHigh = + getPRMT(High, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG); + return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, + {SwappedHigh, SwappedLow}); + } + + llvm_unreachable("unsupported type for bswap"); +} + static unsigned getTcgen05MMADisableOutputLane(unsigned IID) { switch (IID) { case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1: @@ -3191,7 +3236,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerCTLZCTPOP(Op, DAG); case ISD::FREM: return lowerFREM(Op, DAG); - + case ISD::BSWAP: + return lowerBSWAP(Op, DAG); default: llvm_unreachable("Custom lowering not defined for operation"); } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 2bcf4120c7ad2..68c6e318a8dd7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2471,38 +2471,9 @@ include "NVPTXIntrinsics.td" //----------------------------------- // Notes //----------------------------------- -// BSWAP is currently expanded. The following is a more efficient -// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register -// - for sm_20, use pmpt (use vector scalar mov to get the pack and -// unpack). sm_20 supports native 32-bit register, but not native 16-bit -// register. - -def : Pat<(i16 (bswap i16:$a)), - (i16 (CVT_u16_u32 (PRMT_B32rii (i32 (CVT_u32_u16 $a, CvtNONE)), - (i32 0), (i32 0x0001), PrmtNONE), - CvtNONE))>; - -def : Pat < - (i32 (bswap i32:$a)), - (PRMT_B32rii $a, (i32 0), (i32 0x0123), PrmtNONE)>; - -def : Pat < - (v2i16 (bswap v2i16:$a)), - (PRMT_B32rii $a, (i32 0), (i32 0x2301), PrmtNONE)>; - -def : Pat < - (i64 (bswap i64:$a)), - (V2I32toI64 - (PRMT_B32rii (I64toI32H_Sink $a), (i32 0), (i32 0x0123), PrmtNONE), - (PRMT_B32rii (I64toI32L_Sink $a), (i32 0), (i32 0x0123), PrmtNONE))>, - Requires<[hasPTX<71>]>; - -// Fall back to the old way if we don't have PTX 7.1. -def : Pat < - (i64 (bswap i64:$a)), - (V2I32toI64 - (PRMT_B32rii (I64toI32H $a), (i32 0), (i32 0x0123), PrmtNONE), - (PRMT_B32rii (I64toI32L $a), (i32 0), (i32 0x0123), PrmtNONE))>; +// BSWAP is currently custom-lowered during operation legalization in +// NVPTXISelLowering.cpp. +// See the lowerBSWAP function in NVPTXISelLowering.cpp for details. //////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll index a12deed544642..a0bcf0056651c 100644 --- a/llvm/test/CodeGen/NVPTX/bswap.ll +++ b/llvm/test/CodeGen/NVPTX/bswap.ll @@ -10,16 +10,12 @@ target triple = "nvptx64-nvidia-cuda" define i16 @bswap16(i16 %a) { ; CHECK-LABEL: bswap16( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x1U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r2; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ld.param.b16 %r1, [bswap16_param_0]; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7701U; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %b = tail call i16 @llvm.bswap.i16(i16 %a) ret i16 %b @@ -63,7 +59,7 @@ define i64 @bswap64(i64 %a) { ; PTX70-EMPTY: ; PTX70-NEXT: // %bb.0: ; PTX70-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; -; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } +; PTX70-NEXT: cvt.u32.u64 %r1, %rd1; ; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 0x123U; ; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; } ; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 0x123U; @@ -78,7 +74,7 @@ define i64 @bswap64(i64 %a) { ; PTX71-EMPTY: ; PTX71-NEXT: // %bb.0: ; PTX71-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; -; PTX71-NEXT: mov.b64 {%r1, _}, %rd1; +; PTX71-NEXT: cvt.u32.u64 %r1, %rd1; ; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 0x123U; ; PTX71-NEXT: mov.b64 {_, %r3}, %rd1; ; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 0x123U; From 7af6fa42404f9cc398aa12bd33cfb4a6ddd3ebc1 Mon Sep 17 00:00:00 2001 From: chengjunp Date: Fri, 21 Nov 2025 23:48:28 +0000 Subject: [PATCH 4/5] Update code & test --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 39 ++++++++--------- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 8 ---- llvm/test/CodeGen/NVPTX/bswap.ll | 47 ++++++--------------- 3 files changed, 32 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 67d5f99f958d8..454a237b1be78 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1106,10 +1106,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom); // Custom lowering for bswap - setOperationAction(ISD::BSWAP, MVT::i16, Custom); - setOperationAction(ISD::BSWAP, MVT::i32, Custom); - setOperationAction(ISD::BSWAP, MVT::i64, Custom); - setOperationAction(ISD::BSWAP, MVT::v2i16, Custom); + setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16}, + Custom); } TargetLoweringBase::LegalizeTypeAction @@ -2579,38 +2577,37 @@ static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG) { SDValue Src = Op.getOperand(0); EVT VT = Op.getValueType(); - if (VT == MVT::i16) { - SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i16: { + SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src); SDValue Swapped = getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped); } - - if (VT == MVT::i32) { + case MVT::i32: { return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG); } - - if (VT == MVT::v2i16) { - SDValue Converted = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); + case MVT::v2i16: { + SDValue Converted = DAG.getBitcast(MVT::i32, Src); SDValue Swapped = getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG); return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped); } - - if (VT == MVT::i64) { - SDValue Low = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Src, - DAG.getIntPtrConstant(0, DL)); - SDValue High = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Src, - DAG.getIntPtrConstant(1, DL)); + case MVT::i64: { + SDValue UnpackSrc = + DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src); SDValue SwappedLow = - getPRMT(Low, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG); + getPRMT(UnpackSrc.getValue(0), DAG.getConstant(0, DL, MVT::i32), 0x0123, + DL, DAG); SDValue SwappedHigh = - getPRMT(High, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG); + getPRMT(UnpackSrc.getValue(1), DAG.getConstant(0, DL, MVT::i32), 0x0123, + DL, DAG); return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {SwappedHigh, SwappedLow}); } - - llvm_unreachable("unsupported type for bswap"); + default: + llvm_unreachable("unsupported type for bswap"); + } } static unsigned getTcgen05MMADisableOutputLane(unsigned IID) { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 68c6e318a8dd7..04e2dd435cdf0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2468,14 +2468,6 @@ let Predicates = [hasPTX<73>, hasSM<52>] in { include "NVPTXIntrinsics.td" -//----------------------------------- -// Notes -//----------------------------------- -// BSWAP is currently custom-lowered during operation legalization in -// NVPTXISelLowering.cpp. -// See the lowerBSWAP function in NVPTXISelLowering.cpp for details. - - //////////////////////////////////////////////////////////////////////////////// // PTX Fence instructions //////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll index a0bcf0056651c..1e6f95a6201d2 100644 --- a/llvm/test/CodeGen/NVPTX/bswap.ll +++ b/llvm/test/CodeGen/NVPTX/bswap.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; RUN: %if ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %} -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s -; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" @@ -52,35 +49,19 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 { } define i64 @bswap64(i64 %a) { -; PTX70-LABEL: bswap64( -; PTX70: { -; PTX70-NEXT: .reg .b32 %r<5>; -; PTX70-NEXT: .reg .b64 %rd<3>; -; PTX70-EMPTY: -; PTX70-NEXT: // %bb.0: -; PTX70-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; -; PTX70-NEXT: cvt.u32.u64 %r1, %rd1; -; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 0x123U; -; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; } -; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 0x123U; -; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2}; -; PTX70-NEXT: st.param.b64 [func_retval0], %rd2; -; PTX70-NEXT: ret; -; -; PTX71-LABEL: bswap64( -; PTX71: { -; PTX71-NEXT: .reg .b32 %r<5>; -; PTX71-NEXT: .reg .b64 %rd<3>; -; PTX71-EMPTY: -; PTX71-NEXT: // %bb.0: -; PTX71-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; -; PTX71-NEXT: cvt.u32.u64 %r1, %rd1; -; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 0x123U; -; PTX71-NEXT: mov.b64 {_, %r3}, %rd1; -; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 0x123U; -; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2}; -; PTX71-NEXT: st.param.b64 [func_retval0], %rd2; -; PTX71-NEXT: ret; +; CHECK-LABEL: bswap64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x123U; +; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x123U; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; %b = tail call i64 @llvm.bswap.i64(i64 %a) ret i64 %b } From cabb52cd46ee833760d57d26785d4803e8a58ba3 Mon Sep 17 00:00:00 2001 From: chengjunp Date: Sat, 22 Nov 2025 00:32:13 +0000 Subject: [PATCH 5/5] Add v2i32 test --- llvm/test/CodeGen/NVPTX/bswap.ll | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll index 1e6f95a6201d2..8050c6f1c7031 100644 --- a/llvm/test/CodeGen/NVPTX/bswap.ll +++ b/llvm/test/CodeGen/NVPTX/bswap.ll @@ -66,7 +66,22 @@ define i64 @bswap64(i64 %a) { ret i64 %b } +define <2 x i32> @bswapv2i32(<2 x i32> %a) { +; CHECK-LABEL: bswapv2i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [bswapv2i32_param_0]; +; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x123U; +; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x123U; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %b = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a) + ret <2 x i32> %b +} declare i16 @llvm.bswap.i16(i16) declare i32 @llvm.bswap.i32(i32) declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) declare i64 @llvm.bswap.i64(i64) +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)