diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 5081a093d4c34..b88978a50ac16 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3453,7 +3453,8 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { } static std::pair -convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG) { +convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG, + const NVPTXSubtarget &STI) { SDValue Chain = N->getOperand(0); SDValue BasePtr = N->getOperand(1); SDValue Mask = N->getOperand(3); @@ -3495,6 +3496,11 @@ convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG) { MemSDNode *NewLD = cast( DAG.getLoad(ResVT, DL, Chain, BasePtr, N->getMemOperand()).getNode()); + // If our subtarget does not support the used bytes mask pragma, "drop" the + // mask by setting it to UINT32_MAX + if (!STI.hasUsedBytesMaskPragma()) + UsedBytesMask = UINT32_MAX; + return {NewLD, UsedBytesMask}; } @@ -3531,7 +3537,8 @@ replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) { // If we have a masked load, convert it to a normal load now std::optional UsedBytesMask = std::nullopt; if (LD->getOpcode() == ISD::MLOAD) - std::tie(LD, UsedBytesMask) = convertMLOADToLoadWithUsedBytesMask(LD, DAG); + std::tie(LD, UsedBytesMask) = + convertMLOADToLoadWithUsedBytesMask(LD, DAG, STI); // Since LoadV2 is a target node, we cannot rely on DAG type legalization. // Therefore, we must ensure the type is legal. For i1 and i8, we set the @@ -3667,8 +3674,8 @@ SDValue NVPTXTargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { // them here. 
EVT VT = Op.getValueType(); if (NVPTX::isPackedVectorTy(VT)) { - auto Result = - convertMLOADToLoadWithUsedBytesMask(cast(Op.getNode()), DAG); + auto Result = convertMLOADToLoadWithUsedBytesMask( + cast(Op.getNode()), DAG, STI); MemSDNode *LD = std::get<0>(Result); uint32_t UsedBytesMask = std::get<1>(Result); diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index f11d331862081..9b9f871549047 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -89,6 +89,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { return SmVersion >= 100 && PTXVersion >= 88 && AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; } + bool hasUsedBytesMaskPragma() const { + return SmVersion >= 50 && PTXVersion >= 83; + } bool hasAtomAddF64() const { return SmVersion >= 60; } bool hasAtomScope() const { return SmVersion >= 60; } bool hasAtomBitwise64() const { return SmVersion >= 32; } diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll index a75ddd032d4c0..19ec2574e32b4 100644 --- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -50,7 +50,6 @@ define half @fh(ptr %p) { ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: ; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0]; -; ENABLED-NEXT: .pragma "used_bytes_mask 0x3ff"; ; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; } ; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2; diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll index 643de006f14c4..4870050dd2d43 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -171,7 +171,6 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[12]) ; CHECK-LABEL: 
callee_St4x3( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] - ; CHECK: .pragma "used_bytes_mask 0xfff"; ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0]; ; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]}; ; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]]; @@ -394,7 +393,6 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by ; CHECK-LABEL: callee_St4x7( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28] ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; - ; CHECK: .pragma "used_bytes_mask 0xfff"; ; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16]; ; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; diff --git a/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll b/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll new file mode 100644 index 0000000000000..a888d9996a500 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | FileCheck %s -check-prefixes=NOMASK +; RUN: %if ptxas-sm_90 && ptxas-isa-8.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | %ptxas-verify -arch=sm_90 %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck %s -check-prefixes=MASK +; RUN: %if ptxas-sm_90 && ptxas-isa-8.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %} + +; On older architectures and versions, we shouldn't be seeing a used bytes mask pragma. +; Specifically, the pragma is only supported on SM_50 or later, and PTX 8.3 or later. 
+; Architecture fixed at SM_90 for this test for stability, and we vary the PTX version to test the pragma. The load mask below enables lanes 0-2 of the <4 x i32> (bytes 0-11 of 16), which is what the expected 0xfff mask encodes.
+
+define i32 @global_8xi32(ptr %a, ptr %b) {
+; NOMASK-LABEL: global_8xi32(
+; NOMASK:       {
+; NOMASK-NEXT:    .reg .b32 %r<5>;
+; NOMASK-NEXT:    .reg .b64 %rd<2>;
+; NOMASK-EMPTY:
+; NOMASK-NEXT:  // %bb.0:
+; NOMASK-NEXT:    ld.param.b64 %rd1, [global_8xi32_param_0];
+; NOMASK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; NOMASK-NEXT:    st.param.b32 [func_retval0], %r1;
+; NOMASK-NEXT:    ret;
+;
+; MASK-LABEL: global_8xi32(
+; MASK:       {
+; MASK-NEXT:    .reg .b32 %r<5>;
+; MASK-NEXT:    .reg .b64 %rd<2>;
+; MASK-EMPTY:
+; MASK-NEXT:  // %bb.0:
+; MASK-NEXT:    ld.param.b64 %rd1, [global_8xi32_param_0];
+; MASK-NEXT:    .pragma "used_bytes_mask 0xfff";
+; MASK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; MASK-NEXT:    st.param.b32 [func_retval0], %r1;
+; MASK-NEXT:    ret;
+  %a.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 %a, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> poison)
+  %first = extractelement <4 x i32> %a.load, i32 0
+  ret i32 %first
+}
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr , <4 x i1>, <4 x i32>)