Skip to content

Commit

Permalink
[NVPTX] Add 'activemask' builtin and intrinsic support (#79768)
Browse files Browse the repository at this point in the history
Summary:
This patch adds support for getting the 'activemask' instruction's value
without needing to use inline assembly. See the relevant PTX reference
for details.


https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
  • Loading branch information
jhuber6 committed Jan 29, 2024
1 parent 3d0a689 commit d492faa
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 6 deletions.
8 changes: 7 additions & 1 deletion clang/include/clang/Basic/BuiltinsNVPTX.def
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#pragma push_macro("PTX42")
#pragma push_macro("PTX60")
#pragma push_macro("PTX61")
#pragma push_macro("PTX62")
#pragma push_macro("PTX63")
#pragma push_macro("PTX64")
#pragma push_macro("PTX65")
Expand Down Expand Up @@ -76,7 +77,8 @@
#define PTX65 "ptx65|" PTX70
#define PTX64 "ptx64|" PTX65
#define PTX63 "ptx63|" PTX64
#define PTX61 "ptx61|" PTX63
#define PTX62 "ptx62|" PTX63
#define PTX61 "ptx61|" PTX62
#define PTX60 "ptx60|" PTX61
#define PTX42 "ptx42|" PTX60

Expand Down Expand Up @@ -632,6 +634,9 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", PTX60)
TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", PTX60)
TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", PTX60)

// Mask
TARGET_BUILTIN(__nvvm_activemask, "i", "n", PTX62)

// Match
TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", AND(SM_70,PTX60))
TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", AND(SM_70,PTX60))
Expand Down Expand Up @@ -1065,6 +1070,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
#pragma pop_macro("PTX42")
#pragma pop_macro("PTX60")
#pragma pop_macro("PTX61")
#pragma pop_macro("PTX62")
#pragma pop_macro("PTX63")
#pragma pop_macro("PTX64")
#pragma pop_macro("PTX65")
Expand Down
16 changes: 12 additions & 4 deletions clang/test/CodeGen/builtins-nvptx.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 \
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 \
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
// RUN: -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
Expand Down Expand Up @@ -165,6 +165,14 @@ __device__ void sync() {

}

__device__ void activemask() {

// CHECK: call i32 @llvm.nvvm.activemask()

__nvvm_activemask();

}


// NVVM intrinsics

Expand Down
8 changes: 8 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -4599,6 +4599,14 @@ def int_nvvm_vote_ballot_sync :
[IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.ballot.sync">,
ClangBuiltin<"__nvvm_vote_ballot_sync">;

//
// ACTIVEMASK
//
def int_nvvm_activemask :
Intrinsic<[llvm_i32_ty], [],
[IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback, IntrHasSideEffects], "llvm.nvvm.activemask">,
ClangBuiltin<"__nvvm_activemask">;

//
// MATCH.SYNC
//
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/NVPTX/NVPTX.td
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,

def SM90a: FeatureSM<"90a", 901>;

foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 63, 64, 65,
foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83] in
def PTX#version: FeaturePTX<version>;

Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,12 @@ multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntO
Requires<[hasPTX<60>, hasSM<70>]>;
}

// activemask.b32
def ACTIVEMASK : NVPTXInst<(outs Int32Regs:$dest), (ins),
"activemask.b32 \t$dest;",
[(set Int32Regs:$dest, (int_nvvm_activemask))]>,
Requires<[hasPTX<62>, hasSM<30>]>;

defm MATCH_ANY_SYNC_32 : MATCH_ANY_SYNC<Int32Regs, "b32", int_nvvm_match_any_sync_i32,
i32imm>;
defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC<Int64Regs, "b64", int_nvvm_match_any_sync_i64,
Expand Down
38 changes: 38 additions & 0 deletions llvm/test/CodeGen/NVPTX/activemask.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
; RUN: llc < %s -march=nvptx64 -O2 -mcpu=sm_52 -mattr=+ptx62 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx62 | %ptxas-verify %}

declare i32 @llvm.nvvm.activemask()

; CHECK-LABEL: activemask(
;
; CHECK: activemask.b32 %[[REG:.+]];
; CHECK-NEXT: st.param.b32 [func_retval0+0], %[[REG]];
; CHECK-NEXT: ret;
define dso_local i32 @activemask() {
entry:
%mask = call i32 @llvm.nvvm.activemask()
ret i32 %mask
}

; CHECK-LABEL: convergent(
;
; CHECK: activemask.b32 %[[REG:.+]];
; CHECK: activemask.b32 %[[REG]];
; CHECK: .param.b32 [func_retval0+0], %[[REG]];
; CHECK-NEXT: ret;
define dso_local i32 @convergent(i1 %cond) {
entry:
br i1 %cond, label %if.else, label %if.then

if.then:
%0 = call i32 @llvm.nvvm.activemask()
br label %if.end

if.else:
%1 = call i32 @llvm.nvvm.activemask()
br label %if.end

if.end:
%mask = phi i32 [ %0, %if.then ], [ %1, %if.else ]
ret i32 %mask
}

0 comments on commit d492faa

Please sign in to comment.