Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 151 additions & 15 deletions clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {

void buildCUDAModuleCtor();
std::optional<FuncOp> buildCUDAModuleDtor();
std::optional<FuncOp> buildHIPModuleDtor();
std::optional<FuncOp> buildCUDARegisterGlobals();

void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder,
Expand Down Expand Up @@ -1046,8 +1047,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
std::move(cudaGPUBinaryOrErr.get());

// The section names are different for MAC OS X.
llvm::StringRef fatbinConstName = ".nv_fatbin";
llvm::StringRef fatbinSectionName = ".nvFatBinSegment";
llvm::StringRef fatbinConstName =
astCtx->getLangOpts().HIP ? ".hip_fatbin" : ".nv_fatbin";

llvm::StringRef fatbinSectionName =
astCtx->getLangOpts().HIP ? ".hipFatBinSegment" : ".nvFatBinSegment";

// Create a global variable with the contents of GPU binary.
auto fatbinType =
Expand Down Expand Up @@ -1119,7 +1123,66 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
globalCtorList.emplace_back(moduleCtorName,
cir::DefaultGlobalCtorDtorPriority);
builder.setInsertionPointToStart(moduleCtor.addEntryBlock());
if (astCtx->getLangOpts().HIP) {
auto *entryBlock = builder.getInsertionBlock();
auto *parent = builder.getInsertionBlock()->getParent();
auto *ifBlock = builder.createBlock(parent);
auto *exitBlock = builder.createBlock(parent);
{
mlir::OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointToEnd(entryBlock);
mlir::Value handle =
builder.createLoad(loc, builder.createGetGlobal(gpubinHandle));
auto handlePtrTy = llvm::cast<cir::PointerType>(handle.getType());
mlir::Value nullPtr = builder.getNullPtr(handlePtrTy, loc);
auto isNull =
builder.createCompare(loc, cir::CmpOpKind::eq, handle, nullPtr);

builder.create<cir::BrCondOp>(loc, isNull, ifBlock, exitBlock);
}
{
// When handle is null we need to load the fatbin and register it
mlir::OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointToStart(ifBlock);
auto wrapper = builder.createGetGlobal(fatbinWrapper);
auto fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy);
auto gpuBinaryHandleCall =
builder.createCallOp(loc, regFunc, fatbinVoidPtr);
auto gpuBinaryHandle = gpuBinaryHandleCall.getResult();
// Store the value back to the global `__cuda_gpubin_handle`.
auto gpuBinaryHandleGlobal = builder.createGetGlobal(gpubinHandle);
builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal);
builder.create<cir::BrOp>(loc, exitBlock);
}
{
// Exit block
mlir::OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointToStart(exitBlock);
mlir::Value gHandle =
builder.createLoad(loc, builder.createGetGlobal(gpubinHandle));

std::optional<FuncOp> regGlobal = buildCUDARegisterGlobals();
if (regGlobal) {
builder.createCallOp(loc, *regGlobal, gHandle);
}

if (auto dtor = buildHIPModuleDtor()) {
cir::CIRBaseBuilderTy globalBuilder(getContext());
globalBuilder.setInsertionPointToStart(theModule.getBody());
FuncOp atexit = buildRuntimeFunction(
globalBuilder, "atexit", loc,
FuncType::get(PointerType::get(dtor->getFunctionType()), intTy));

mlir::Value dtorFunc = GetGlobalOp::create(
builder, loc, PointerType::get(dtor->getFunctionType()),
mlir::FlatSymbolRefAttr::get(dtor->getSymNameAttr()));
builder.createCallOp(loc, atexit, dtorFunc);
}
cir::ReturnOp::create(builder, loc);
}
return;
}
// CUDA CTOR-DTOR generations
// Register binary with CUDA runtime. This is substantially different in
// default mode vs. separate compilation.
// Corresponding code:
Expand Down Expand Up @@ -1243,9 +1306,10 @@ void LoweringPreparePass::buildCUDARegisterGlobalFunctions(
auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp {
auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size());

auto tmpString = GlobalOp::create(
globalBuilder, loc, (".str" + str).str(), strType, /*isConstant=*/true,
/*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);
auto tmpString =
GlobalOp::create(globalBuilder, loc, (".str" + str).str(), strType,
/*isConstant=*/true,
/*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);

// We must make the string zero-terminated.
tmpString.setInitialValueAttr(ConstArrayAttr::get(
Expand All @@ -1260,19 +1324,91 @@ void LoweringPreparePass::buildCUDARegisterGlobalFunctions(
GlobalOp deviceFuncStr = makeConstantString(kernelName);
mlir::Value deviceFunc = builder.createBitcast(
builder.createGetGlobal(deviceFuncStr), voidPtrTy);
mlir::Value hostFunc = builder.createBitcast(
GetGlobalOp::create(
builder, loc, PointerType::get(deviceStub.getFunctionType()),
mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())),
voidPtrTy);
builder.createCallOp(
loc, cudaRegisterFunction,
{fatbinHandle, hostFunc, deviceFunc, deviceFunc,
ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)), cirNullPtr,
cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});
if (astCtx->getLangOpts().HIP) {
auto funcHandle = cast<GlobalOp>(theModule.lookupSymbol(kernelName));
mlir::Value hostFunc =
builder.createBitcast(builder.createGetGlobal(funcHandle), voidPtrTy);
builder.createCallOp(
loc, cudaRegisterFunction,
{fatbinHandle, hostFunc, deviceFunc, deviceFunc,
ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)),
cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});

} else {
mlir::Value hostFunc = builder.createBitcast(
GetGlobalOp::create(
builder, loc, PointerType::get(deviceStub.getFunctionType()),
mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())),
voidPtrTy);
builder.createCallOp(
loc, cudaRegisterFunction,
{fatbinHandle, hostFunc, deviceFunc, deviceFunc,
ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)),
cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});
}
}
}

std::optional<FuncOp> LoweringPreparePass::buildHIPModuleDtor() {
if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName()))
return {};

std::string prefix = getCUDAPrefix(astCtx);

auto voidTy = VoidType::get(&getContext());
auto voidPtrPtrTy = PointerType::get(PointerType::get(voidTy));

auto loc = theModule.getLoc();

cir::CIRBaseBuilderTy builder(getContext());
builder.setInsertionPointToStart(theModule.getBody());

// void __hipUnregisterFatBinary(void ** andle);
std::string unregisterFuncName =
addUnderscoredPrefix(prefix, "UnregisterFatBinary");
FuncOp unregisterFunc = buildRuntimeFunction(
builder, unregisterFuncName, loc, FuncType::get({voidPtrPtrTy}, voidTy));

// void __hip_module_dtor();
// Despite the name, OG doesn't treat it as a destructor, so it shouldn't be
// put into globalDtorList. If it were a real dtor, then it would cause
// double free. The way to use it is to manually call
// atexit() at end of module ctor.
std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor");
FuncOp dtor =
buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy),
GlobalLinkageKind::InternalLinkage);

std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle");
auto gpuBinGlobal = cast<GlobalOp>(theModule.lookupSymbol(gpubinName));
auto *entryBlock = dtor.addEntryBlock();
auto *ifBlock = builder.createBlock(&dtor.getBody());
auto *exitBlock = builder.createBlock(&dtor.getBody());
mlir::OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointToEnd(entryBlock);
mlir::Value handle =
builder.createLoad(loc, builder.createGetGlobal(gpuBinGlobal));
auto handlePtrTy = llvm::cast<cir::PointerType>(handle.getType());
mlir::Value nullPtr = builder.getNullPtr(handlePtrTy, loc);
auto isNull = builder.createCompare(loc, cir::CmpOpKind::ne, handle, nullPtr);
builder.create<cir::BrCondOp>(loc, isNull, ifBlock, exitBlock);
{
// When handle is not null we need to unregister it and store null to handle
mlir::OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointToStart(ifBlock);
builder.createCallOp(loc, unregisterFunc, handle);
builder.createStore(loc, nullPtr, builder.createGetGlobal(gpuBinGlobal));
builder.create<cir::BrOp>(loc, exitBlock);
}
{
// Exit block
mlir::OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointToStart(exitBlock);
cir::ReturnOp::create(builder, loc);
}
return dtor;
}

std::optional<FuncOp> LoweringPreparePass::buildCUDAModuleDtor() {
if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName()))
return {};
Expand Down
191 changes: 191 additions & 0 deletions clang/test/CIR/CodeGen/HIP/registration.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#include "cuda.h"

// RUN: echo "sample fatbin" > %t.fatbin
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
// RUN: -x hip -emit-cir -fhip-new-launch-api -I%S/../Inputs/ \
// RUN: -fcuda-include-gpubinary %t.fatbin \
// RUN: %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s

// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
// RUN: -x hip -emit-llvm -fhip-new-launch-api -I%S/../Inputs/ \
// RUN: -fcuda-include-gpubinary %t.fatbin \
// RUN: %s -o %t.ll
// RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s

// OGCG emits LLVM IR in different order than clangir, we add at the end the order of OGCG.
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \
// RUN: -x hip -emit-llvm -fhip-new-launch-api -I%S/../Inputs/ \
// RUN: -fcuda-include-gpubinary %t.fatbin \
// RUN: %s -o %t.ll
// RUN: FileCheck --check-prefix=OGCG-HOST --input-file=%t.ll %s


// CIR-HOST: module @"{{.*}}" attributes {
// CIR-HOST: cir.cu.binary_handle = #cir.cu.binary_handle<{{.*}}.fatbin>,
// CIR-HOST: cir.global_ctors = [#cir.global_ctor<"__hip_module_ctor", {{[0-9]+}}>]
// CIR-HOST: }

// LLVM-HOST: @.str_Z2fnv = private constant [7 x i8] c"_Z2fnv\00"
// LLVM-HOST: @__hip_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin"
// LLVM-HOST: @__hip_fatbin_wrapper = internal constant {
// LLVM-HOST: i32 1212764230, i32 1, ptr @__hip_fatbin_str, ptr null
// LLVM-HOST: }, section ".hipFatBinSegment"
// LLVM-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8
// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__hip_module_ctor

// CIR-HOST: cir.func internal private @__hip_module_dtor() {
// CIR-HOST: %[[#HandleGlobal:]] = cir.get_global @__hip_gpubin_handle
// CIR-HOST: %[[#HandleAddr:]] = cir.load %[[#HandleGlobal]] : !cir.ptr<!cir.ptr<!cir.ptr<!void>>>, !cir.ptr<!cir.ptr<!void>> loc(#loc)
// CIR-HOST: %[[#NullVal:]] = cir.const #cir.ptr<null> : !cir.ptr<!cir.ptr<!void>> loc(#loc)
// CIR-HOST: %3 = cir.cmp(ne, %[[#HandleAddr]], %[[#NullVal]]) : !cir.ptr<!cir.ptr<!void>>, !cir.bool loc(#loc)
// CIR-HOST: cir.brcond %3 ^bb1, ^bb2 loc(#loc)
// CIR-HOST: ^bb1:
// CIR-HOST: cir.call @__hipUnregisterFatBinary(%[[#HandleAddr]]) : (!cir.ptr<!cir.ptr<!void>>) -> () loc(#loc)
// CIR-HOST: %[[#HandleAddr:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
// CIR-HOST: cir.store %[[#NullVal]], %[[#HandleAddr]] : !cir.ptr<!cir.ptr<!void>>, !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
// CIR-HOST: cir.br ^bb2 loc(#loc)
// CIR-HOST: ^bb2: // 2 preds: ^bb0, ^bb1
// CIR-HOST: cir.return loc(#loc)
// CIR-HOST: } loc(#loc)

// LLVM-HOST: define internal void @__hip_module_dtor() {
// LLVM-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8
// LLVM-HOST: %[[#ICMP:]] = icmp ne ptr %[[#LLVMHandleVar]], null
// LLVM-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]]
// LLVM-HOST: [[IFBLOCK]]: ; preds = %0
// LLVM-HOST: call void @__hipUnregisterFatBinary(ptr %[[#LLVMHandleVar]])
// LLVM-HOST: store ptr null, ptr @__hip_gpubin_handle, align 8
// LLVM-HOST: br label %[[EXITBLOCK]]
// LLVM-HOST: [[EXITBLOCK]]: ; preds = %[[IFBLOCK]], %0
// LLVM-HOST: ret void
// LLVM-HOST: }

// CIR-HOST: cir.global "private" constant cir_private @".str_Z2fnv" =
// CIR-HOST-SAME: #cir.const_array<"_Z2fnv", trailing_zeros>

__global__ void fn() {}

// CIR-HOST: cir.func internal private @__hip_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) {
// CIR-HOST: %[[#NULL:]] = cir.const #cir.ptr<null>
// CIR-HOST: %[[#T1:]] = cir.get_global @".str_Z2fnv"
// CIR-HOST: %[[#DeviceFn:]] = cir.cast bitcast %[[#T1]]
// CIR-HOST: %[[#T2:]] = cir.get_global @_Z2fnv
// CIR-HOST: %[[#HostFnHandle:]] = cir.cast bitcast %[[#T2]]
// CIR-HOST: %[[#MinusOne:]] = cir.const #cir.int<-1>
// CIR-HOST: cir.call @__hipRegisterFunction(
// CIR-HOST-SAME: %[[FatbinHandle]],
// CIR-HOST-SAME: %[[#HostFnHandle]],
// CIR-HOST-SAME: %[[#DeviceFn]],
// CIR-HOST-SAME: %[[#DeviceFn]],
// CIR-HOST-SAME: %[[#MinusOne]],
// CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]])
// CIR-HOST: }

// LLVM-HOST: define internal void @__hip_register_globals(ptr %[[#LLVMFatbin:]]) {
// LLVM-HOST: call i32 @__hipRegisterFunction(
// LLVM-HOST-SAME: ptr %[[#LLVMFatbin]],
// LLVM-HOST-SAME: ptr @_Z2fnv,
// LLVM-HOST-SAME: ptr @.str_Z2fnv,
// LLVM-HOST-SAME: ptr @.str_Z2fnv,
// LLVM-HOST-SAME: i32 -1,
// LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null)
// LLVM-HOST: }

// The content in const array should be the same as echoed above,
// with a trailing line break ('\n', 0x0A).
// CIR-HOST: cir.global "private" constant cir_private @__hip_fatbin_str =
// CIR-HOST-SAME: #cir.const_array<"sample fatbin\0A">
// CIR-HOST-SAME: {{.*}}section = ".hip_fatbin"

// The first value is HIP file head magic number.
// CIR-HOST: cir.global "private" constant internal @__hip_fatbin_wrapper
// CIR-HOST: = #cir.const_record<{
// CIR-HOST: #cir.int<1212764230> : !s32i,
// CIR-HOST: #cir.int<1> : !s32i,
// CIR-HOST: #cir.global_view<@__hip_fatbin_str> : !cir.ptr<!void>,
// CIR-HOST: #cir.ptr<null> : !cir.ptr<!void>
// CIR-HOST: }>
// CIR-HOST-SAME: {{.*}}section = ".hipFatBinSegment"

// CIR-HOST: cir.func internal private @__hip_module_ctor() {
// CIR-HOST: %[[#HandleGlobalVar:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
// CIR-HOST: %[[#HandleAddr:]] = cir.load %[[#HandleGlobalVar]] : !cir.ptr<!cir.ptr<!cir.ptr<!void>>>, !cir.ptr<!cir.ptr<!void>> loc(#loc)
// CIR-HOST: %[[#NullVal:]] = cir.const #cir.ptr<null> : !cir.ptr<!cir.ptr<!void>> loc(#loc)
// CIR-HOST: %[[#ICMP:]] = cir.cmp(eq, %[[#HandleAddr]], %[[#NullVal]]) : !cir.ptr<!cir.ptr<!void>>, !cir.bool loc(#loc)
// CIR-HOST: cir.brcond %[[#ICMP]] ^bb1, ^bb2 loc(#loc)
// CIR-HOST: ^bb1:
// CIR-HOST: %[[#FatBinWrapper:]] = cir.get_global @__hip_fatbin_wrapper : !cir.ptr<!rec_anon_struct> loc(#loc)
// CIR-HOST: %[[#CastGlobalFatBin:]] = cir.cast bitcast %[[#FatBinWrapper]] : !cir.ptr<!rec_anon_struct> -> !cir.ptr<!void> loc(#loc)
// CIR-HOST: %[[#RTVal:]] = cir.call @__hipRegisterFatBinary(%[[#CastGlobalFatBin]]) : (!cir.ptr<!void>) -> !cir.ptr<!cir.ptr<!void>> loc(#loc)
// CIR-HOST: %[[#HandleGlobalVar:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
// CIR-HOST: cir.store %[[#RTVal]], %[[#HandleGlobalVar]] : !cir.ptr<!cir.ptr<!void>>, !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
// CIR-HOST: cir.br ^bb2 loc(#loc)
// CIR-HOST: ^bb2:
// CIR-HOST: %[[#HandleGlobalVar:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
// CIR-HOST: %[[#HandleVal:]] = cir.load %8 : !cir.ptr<!cir.ptr<!cir.ptr<!void>>>, !cir.ptr<!cir.ptr<!void>> loc(#loc)
// CIR-HOST: cir.call @__hip_register_globals(%[[#HandleVal]]) : (!cir.ptr<!cir.ptr<!void>>) -> () loc(#loc)
// CIR-HOST: %[[#DTOR:]] = cir.get_global @__hip_module_dtor : !cir.ptr<!cir.func<()>> loc(#loc)
// CIR-HOST: %11 = cir.call @atexit(%[[#DTOR]]) : (!cir.ptr<!cir.func<()>>) -> !s32i loc(#loc)
// CIR-HOST: cir.return loc(#loc)
// CIR-HOST: } loc(#loc)

// LLVM-HOST: define internal void @__hip_module_ctor() {
// LLVM-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8
// LLVM-HOST: %[[#ICMP:]] = icmp eq ptr %[[#LLVMHandleVar]], null
// LLVM-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]]
// LLVM-HOST: [[IFBLOCK]]:
// LLVM-HOST: %[[#Value:]] = call ptr @__hipRegisterFatBinary(ptr @__hip_fatbin_wrapper)
// LLVM-HOST: store ptr %[[#Value]], ptr @__hip_gpubin_handle, align 8
// LLVM-HOST: br label %[[EXITBLOCK]]
// LLVM-HOST: [[EXITBLOCK]]:
// LLVM-HOST: %[[#HandleValue:]] = load ptr, ptr @__hip_gpubin_handle, align 8
// LLVM-HOST: call void @__hip_register_globals(ptr %[[#HandleValue]])
// LLVM-HOST: call i32 @atexit(ptr @__hip_module_dtor)
// LLVM-HOST: ret void

// OGCG-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8
// OGCG-HOST: @0 = private unnamed_addr constant [7 x i8] c"_Z2fnv\00", align 1
// OGCG-HOST: @1 = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin", align 4096
// OGCG-HOST: @__hip_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1212764230, i32 1, ptr @1, ptr null }, section ".hipFatBinSegment", align 8
// OGCG-HOST: @__hip_gpubin_handle = internal global ptr null, align 8
// OGCG-HOST: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @__hip_module_ctor, ptr null }]

// OGCG-HOST: define internal void @__hip_register_globals(ptr %[[#LLVMFatbin:]]) {
// OGCG-HOST: entry:
// OGCG-HOST: call i32 @__hipRegisterFunction(
// OGCG-HOST-SAME: ptr %[[#LLVMFatbin]],
// OGCG-HOST-SAME: ptr @_Z2fnv,
// OGCG-HOST-SAME: ptr @0,
// OGCG-HOST-SAME: ptr @0,
// OGCG-HOST-SAME: i32 -1,
// OGCG-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null)
// OGCG-HOST: }

// OGCG-HOST: define internal void @__hip_module_ctor() {
// OGCG-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8
// OGCG-HOST: %[[#ICMP:]] = icmp eq ptr %[[#LLVMHandleVar]], null
// OGCG-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]]
// OGCG-HOST: [[IFBLOCK]]:
// OGCG-HOST: %[[#Value:]] = call ptr @__hipRegisterFatBinary(ptr @__hip_fatbin_wrapper)
// OGCG-HOST: store ptr %[[#Value]], ptr @__hip_gpubin_handle, align 8
// OGCG-HOST: br label %[[EXITBLOCK]]
// OGCG-HOST: [[EXITBLOCK]]:
// OGCG-HOST: %[[#HandleValue:]] = load ptr, ptr @__hip_gpubin_handle, align 8
// OGCG-HOST: call void @__hip_register_globals(ptr %[[#HandleValue]])
// OGCG-HOST: call i32 @atexit(ptr @__hip_module_dtor)
// OGCG-HOST: ret void

// OGCG-HOST: define internal void @__hip_module_dtor() {
// OGCG-HOST: entry:
// OGCG-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8
// OGCG-HOST: %[[#ICMP:]] = icmp ne ptr %[[#LLVMHandleVar]], null
// OGCG-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]]
// OGCG-HOST: [[IFBLOCK]]: ; preds = %entry
// OGCG-HOST: call void @__hipUnregisterFatBinary(ptr %[[#LLVMHandleVar]])
// OGCG-HOST: store ptr null, ptr @__hip_gpubin_handle, align 8
// OGCG-HOST: br label %[[EXITBLOCK]]
// OGCG-HOST: [[EXITBLOCK]]: ; preds = %[[IFBLOCK]], %entry
// OGCG-HOST: ret void
// OGCG-HOST: }

Loading