diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index a2a04dc2a47d..f9b9da07be3b 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -130,6 +130,7 @@ struct LoweringPreparePass : public LoweringPrepareBase { void buildCUDAModuleCtor(); std::optional buildCUDAModuleDtor(); + std::optional buildHIPModuleDtor(); std::optional buildCUDARegisterGlobals(); void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder, @@ -1046,8 +1047,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() { std::move(cudaGPUBinaryOrErr.get()); // The section names are different for MAC OS X. - llvm::StringRef fatbinConstName = ".nv_fatbin"; - llvm::StringRef fatbinSectionName = ".nvFatBinSegment"; + llvm::StringRef fatbinConstName = + astCtx->getLangOpts().HIP ? ".hip_fatbin" : ".nv_fatbin"; + + llvm::StringRef fatbinSectionName = + astCtx->getLangOpts().HIP ? ".hipFatBinSegment" : ".nvFatBinSegment"; // Create a global variable with the contents of GPU binary. auto fatbinType = @@ -1119,7 +1123,66 @@ void LoweringPreparePass::buildCUDAModuleCtor() { globalCtorList.emplace_back(moduleCtorName, cir::DefaultGlobalCtorDtorPriority); builder.setInsertionPointToStart(moduleCtor.addEntryBlock()); + if (astCtx->getLangOpts().HIP) { + auto *entryBlock = builder.getInsertionBlock(); + auto *parent = builder.getInsertionBlock()->getParent(); + auto *ifBlock = builder.createBlock(parent); + auto *exitBlock = builder.createBlock(parent); + { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToEnd(entryBlock); + mlir::Value handle = + builder.createLoad(loc, builder.createGetGlobal(gpubinHandle)); + auto handlePtrTy = llvm::cast(handle.getType()); + mlir::Value nullPtr = builder.getNullPtr(handlePtrTy, loc); + auto isNull = + builder.createCompare(loc, cir::CmpOpKind::eq, handle, nullPtr); + + builder.create(loc, isNull, ifBlock, exitBlock); + } + { + // When handle is null we need to load the fatbin and register it + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(ifBlock); + auto wrapper = builder.createGetGlobal(fatbinWrapper); + auto fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy); + auto gpuBinaryHandleCall = + builder.createCallOp(loc, regFunc, fatbinVoidPtr); + auto gpuBinaryHandle = gpuBinaryHandleCall.getResult(); + // Store the value back to the global `__cuda_gpubin_handle`. + auto gpuBinaryHandleGlobal = builder.createGetGlobal(gpubinHandle); + builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal); + builder.create(loc, exitBlock); + } + { + // Exit block + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(exitBlock); + mlir::Value gHandle = + builder.createLoad(loc, builder.createGetGlobal(gpubinHandle)); + + std::optional regGlobal = buildCUDARegisterGlobals(); + if (regGlobal) { + builder.createCallOp(loc, *regGlobal, gHandle); + } + if (auto dtor = buildHIPModuleDtor()) { + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(theModule.getBody()); + FuncOp atexit = buildRuntimeFunction( + globalBuilder, "atexit", loc, + FuncType::get(PointerType::get(dtor->getFunctionType()), intTy)); + + mlir::Value dtorFunc = GetGlobalOp::create( + builder, loc, PointerType::get(dtor->getFunctionType()), + mlir::FlatSymbolRefAttr::get(dtor->getSymNameAttr())); + builder.createCallOp(loc, atexit, dtorFunc); + } + cir::ReturnOp::create(builder, loc); + } + return; + } + // CUDA CTOR-DTOR generations // Register binary with CUDA runtime. This is substantially different in // default mode vs. separate compilation. // Corresponding code: @@ -1243,9 +1306,10 @@ void LoweringPreparePass::buildCUDARegisterGlobalFunctions( auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp { auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size()); - auto tmpString = GlobalOp::create( - globalBuilder, loc, (".str" + str).str(), strType, /*isConstant=*/true, - /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); + auto tmpString = + GlobalOp::create(globalBuilder, loc, (".str" + str).str(), strType, + /*isConstant=*/true, + /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); // We must make the string zero-terminated. tmpString.setInitialValueAttr(ConstArrayAttr::get( @@ -1260,19 +1324,91 @@ void LoweringPreparePass::buildCUDARegisterGlobalFunctions( GlobalOp deviceFuncStr = makeConstantString(kernelName); mlir::Value deviceFunc = builder.createBitcast( builder.createGetGlobal(deviceFuncStr), voidPtrTy); - mlir::Value hostFunc = builder.createBitcast( - GetGlobalOp::create( - builder, loc, PointerType::get(deviceStub.getFunctionType()), - mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())), - voidPtrTy); - builder.createCallOp( - loc, cudaRegisterFunction, - {fatbinHandle, hostFunc, deviceFunc, deviceFunc, - ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)), cirNullPtr, - cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr}); + if (astCtx->getLangOpts().HIP) { + auto funcHandle = cast(theModule.lookupSymbol(kernelName)); + mlir::Value hostFunc = + builder.createBitcast(builder.createGetGlobal(funcHandle), voidPtrTy); + builder.createCallOp( + loc, cudaRegisterFunction, + {fatbinHandle, hostFunc, deviceFunc, deviceFunc, + ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)), + cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr}); + + } else { + mlir::Value hostFunc = builder.createBitcast( + GetGlobalOp::create( + builder, loc, PointerType::get(deviceStub.getFunctionType()), + mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())), + voidPtrTy); + builder.createCallOp( + loc, cudaRegisterFunction, + {fatbinHandle, hostFunc, deviceFunc, deviceFunc, + ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)), + cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr}); + } } } +std::optional LoweringPreparePass::buildHIPModuleDtor() { + if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName())) + return {}; + + std::string prefix = getCUDAPrefix(astCtx); + + auto voidTy = VoidType::get(&getContext()); + auto voidPtrPtrTy = PointerType::get(PointerType::get(voidTy)); + + auto loc = theModule.getLoc(); + + cir::CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointToStart(theModule.getBody()); + + // void __hipUnregisterFatBinary(void ** andle); + std::string unregisterFuncName = + addUnderscoredPrefix(prefix, "UnregisterFatBinary"); + FuncOp unregisterFunc = buildRuntimeFunction( + builder, unregisterFuncName, loc, FuncType::get({voidPtrPtrTy}, voidTy)); + + // void __hip_module_dtor(); + // Despite the name, OG doesn't treat it as a destructor, so it shouldn't be + // put into globalDtorList. If it were a real dtor, then it would cause + // double free. The way to use it is to manually call + // atexit() at end of module ctor. + std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor"); + FuncOp dtor = + buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy), + GlobalLinkageKind::InternalLinkage); + + std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle"); + auto gpuBinGlobal = cast(theModule.lookupSymbol(gpubinName)); + auto *entryBlock = dtor.addEntryBlock(); + auto *ifBlock = builder.createBlock(&dtor.getBody()); + auto *exitBlock = builder.createBlock(&dtor.getBody()); + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToEnd(entryBlock); + mlir::Value handle = + builder.createLoad(loc, builder.createGetGlobal(gpuBinGlobal)); + auto handlePtrTy = llvm::cast(handle.getType()); + mlir::Value nullPtr = builder.getNullPtr(handlePtrTy, loc); + auto isNull = builder.createCompare(loc, cir::CmpOpKind::ne, handle, nullPtr); + builder.create(loc, isNull, ifBlock, exitBlock); + { + // When handle is not null we need to unregister it and store null to handle + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(ifBlock); + builder.createCallOp(loc, unregisterFunc, handle); + builder.createStore(loc, nullPtr, builder.createGetGlobal(gpuBinGlobal)); + builder.create(loc, exitBlock); + } + { + // Exit block + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(exitBlock); + cir::ReturnOp::create(builder, loc); + } + return dtor; +} + std::optional LoweringPreparePass::buildCUDAModuleDtor() { if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName())) return {}; diff --git a/clang/test/CIR/CodeGen/HIP/registration.cpp b/clang/test/CIR/CodeGen/HIP/registration.cpp new file mode 100644 index 000000000000..a8294e972909 --- /dev/null +++ b/clang/test/CIR/CodeGen/HIP/registration.cpp @@ -0,0 +1,191 @@ +#include "cuda.h" + +// RUN: echo "sample fatbin" > %t.fatbin +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -x hip -emit-cir -fhip-new-launch-api -I%S/../Inputs/ \ +// RUN: -fcuda-include-gpubinary %t.fatbin \ +// RUN: %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -x hip -emit-llvm -fhip-new-launch-api -I%S/../Inputs/ \ +// RUN: -fcuda-include-gpubinary %t.fatbin \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s + +// OGCG emits LLVM IR in different order than clangir, we add at the end the order of OGCG. +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -x hip -emit-llvm -fhip-new-launch-api -I%S/../Inputs/ \ +// RUN: -fcuda-include-gpubinary %t.fatbin \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-HOST --input-file=%t.ll %s + + +// CIR-HOST: module @"{{.*}}" attributes { +// CIR-HOST: cir.cu.binary_handle = #cir.cu.binary_handle<{{.*}}.fatbin>, +// CIR-HOST: cir.global_ctors = [#cir.global_ctor<"__hip_module_ctor", {{[0-9]+}}>] +// CIR-HOST: } + +// LLVM-HOST: @.str_Z2fnv = private constant [7 x i8] c"_Z2fnv\00" +// LLVM-HOST: @__hip_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin" +// LLVM-HOST: @__hip_fatbin_wrapper = internal constant { +// LLVM-HOST: i32 1212764230, i32 1, ptr @__hip_fatbin_str, ptr null +// LLVM-HOST: }, section ".hipFatBinSegment" +// LLVM-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8 +// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__hip_module_ctor + +// CIR-HOST: cir.func internal private @__hip_module_dtor() { +// CIR-HOST: %[[#HandleGlobal:]] = cir.get_global @__hip_gpubin_handle +// CIR-HOST: %[[#HandleAddr:]] = cir.load %[[#HandleGlobal]] : !cir.ptr>>, !cir.ptr> loc(#loc) +// CIR-HOST: %[[#NullVal:]] = cir.const #cir.ptr : !cir.ptr> loc(#loc) +// CIR-HOST: %3 = cir.cmp(ne, %[[#HandleAddr]], %[[#NullVal]]) : !cir.ptr>, !cir.bool loc(#loc) +// CIR-HOST: cir.brcond %3 ^bb1, ^bb2 loc(#loc) +// CIR-HOST: ^bb1: +// CIR-HOST: cir.call @__hipUnregisterFatBinary(%[[#HandleAddr]]) : (!cir.ptr>) -> () loc(#loc) +// CIR-HOST: %[[#HandleAddr:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr>> loc(#loc) +// CIR-HOST: cir.store %[[#NullVal]], %[[#HandleAddr]] : !cir.ptr>, !cir.ptr>> loc(#loc) +// CIR-HOST: cir.br ^bb2 loc(#loc) +// CIR-HOST: ^bb2: // 2 preds: ^bb0, ^bb1 +// CIR-HOST: cir.return loc(#loc) +// CIR-HOST: } loc(#loc) + +// LLVM-HOST: define internal void @__hip_module_dtor() { +// LLVM-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8 +// LLVM-HOST: %[[#ICMP:]] = icmp ne ptr %[[#LLVMHandleVar]], null +// LLVM-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]] +// LLVM-HOST: [[IFBLOCK]]: ; preds = %0 +// LLVM-HOST: call void @__hipUnregisterFatBinary(ptr %[[#LLVMHandleVar]]) +// LLVM-HOST: store ptr null, ptr @__hip_gpubin_handle, align 8 +// LLVM-HOST: br label %[[EXITBLOCK]] +// LLVM-HOST: [[EXITBLOCK]]: ; preds = %[[IFBLOCK]], %0 +// LLVM-HOST: ret void +// LLVM-HOST: } + +// CIR-HOST: cir.global "private" constant cir_private @".str_Z2fnv" = +// CIR-HOST-SAME: #cir.const_array<"_Z2fnv", trailing_zeros> + +__global__ void fn() {} + +// CIR-HOST: cir.func internal private @__hip_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) { +// CIR-HOST: %[[#NULL:]] = cir.const #cir.ptr +// CIR-HOST: %[[#T1:]] = cir.get_global @".str_Z2fnv" +// CIR-HOST: %[[#DeviceFn:]] = cir.cast bitcast %[[#T1]] +// CIR-HOST: %[[#T2:]] = cir.get_global @_Z2fnv +// CIR-HOST: %[[#HostFnHandle:]] = cir.cast bitcast %[[#T2]] +// CIR-HOST: %[[#MinusOne:]] = cir.const #cir.int<-1> +// CIR-HOST: cir.call @__hipRegisterFunction( +// CIR-HOST-SAME: %[[FatbinHandle]], +// CIR-HOST-SAME: %[[#HostFnHandle]], +// CIR-HOST-SAME: %[[#DeviceFn]], +// CIR-HOST-SAME: %[[#DeviceFn]], +// CIR-HOST-SAME: %[[#MinusOne]], +// CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]]) +// CIR-HOST: } + +// LLVM-HOST: define internal void @__hip_register_globals(ptr %[[#LLVMFatbin:]]) { +// LLVM-HOST: call i32 @__hipRegisterFunction( +// LLVM-HOST-SAME: ptr %[[#LLVMFatbin]], +// LLVM-HOST-SAME: ptr @_Z2fnv, +// LLVM-HOST-SAME: ptr @.str_Z2fnv, +// LLVM-HOST-SAME: ptr @.str_Z2fnv, +// LLVM-HOST-SAME: i32 -1, +// LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) +// LLVM-HOST: } + +// The content in const array should be the same as echoed above, +// with a trailing line break ('\n', 0x0A). +// CIR-HOST: cir.global "private" constant cir_private @__hip_fatbin_str = +// CIR-HOST-SAME: #cir.const_array<"sample fatbin\0A"> +// CIR-HOST-SAME: {{.*}}section = ".hip_fatbin" + +// The first value is HIP file head magic number. +// CIR-HOST: cir.global "private" constant internal @__hip_fatbin_wrapper +// CIR-HOST: = #cir.const_record<{ +// CIR-HOST: #cir.int<1212764230> : !s32i, +// CIR-HOST: #cir.int<1> : !s32i, +// CIR-HOST: #cir.global_view<@__hip_fatbin_str> : !cir.ptr, +// CIR-HOST: #cir.ptr : !cir.ptr +// CIR-HOST: }> +// CIR-HOST-SAME: {{.*}}section = ".hipFatBinSegment" + +// CIR-HOST: cir.func internal private @__hip_module_ctor() { +// CIR-HOST: %[[#HandleGlobalVar:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr>> loc(#loc) +// CIR-HOST: %[[#HandleAddr:]] = cir.load %[[#HandleGlobalVar]] : !cir.ptr>>, !cir.ptr> loc(#loc) +// CIR-HOST: %[[#NullVal:]] = cir.const #cir.ptr : !cir.ptr> loc(#loc) +// CIR-HOST: %[[#ICMP:]] = cir.cmp(eq, %[[#HandleAddr]], %[[#NullVal]]) : !cir.ptr>, !cir.bool loc(#loc) +// CIR-HOST: cir.brcond %[[#ICMP]] ^bb1, ^bb2 loc(#loc) +// CIR-HOST: ^bb1: +// CIR-HOST: %[[#FatBinWrapper:]] = cir.get_global @__hip_fatbin_wrapper : !cir.ptr loc(#loc) +// CIR-HOST: %[[#CastGlobalFatBin:]] = cir.cast bitcast %[[#FatBinWrapper]] : !cir.ptr -> !cir.ptr loc(#loc) +// CIR-HOST: %[[#RTVal:]] = cir.call @__hipRegisterFatBinary(%[[#CastGlobalFatBin]]) : (!cir.ptr) -> !cir.ptr> loc(#loc) +// CIR-HOST: %[[#HandleGlobalVar:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr>> loc(#loc) +// CIR-HOST: cir.store %[[#RTVal]], %[[#HandleGlobalVar]] : !cir.ptr>, !cir.ptr>> loc(#loc) +// CIR-HOST: cir.br ^bb2 loc(#loc) +// CIR-HOST: ^bb2: +// CIR-HOST: %[[#HandleGlobalVar:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr>> loc(#loc) +// CIR-HOST: %[[#HandleVal:]] = cir.load %8 : !cir.ptr>>, !cir.ptr> loc(#loc) +// CIR-HOST: cir.call @__hip_register_globals(%[[#HandleVal]]) : (!cir.ptr>) -> () loc(#loc) +// CIR-HOST: %[[#DTOR:]] = cir.get_global @__hip_module_dtor : !cir.ptr> loc(#loc) +// CIR-HOST: %11 = cir.call @atexit(%[[#DTOR]]) : (!cir.ptr>) -> !s32i loc(#loc) +// CIR-HOST: cir.return loc(#loc) +// CIR-HOST: } loc(#loc) + +// LLVM-HOST: define internal void @__hip_module_ctor() { +// LLVM-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8 +// LLVM-HOST: %[[#ICMP:]] = icmp eq ptr %[[#LLVMHandleVar]], null +// LLVM-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]] +// LLVM-HOST: [[IFBLOCK]]: +// LLVM-HOST: %[[#Value:]] = call ptr @__hipRegisterFatBinary(ptr @__hip_fatbin_wrapper) +// LLVM-HOST: store ptr %[[#Value]], ptr @__hip_gpubin_handle, align 8 +// LLVM-HOST: br label %[[EXITBLOCK]] +// LLVM-HOST: [[EXITBLOCK]]: +// LLVM-HOST: %[[#HandleValue:]] = load ptr, ptr @__hip_gpubin_handle, align 8 +// LLVM-HOST: call void @__hip_register_globals(ptr %[[#HandleValue]]) +// LLVM-HOST: call i32 @atexit(ptr @__hip_module_dtor) +// LLVM-HOST: ret void + +// OGCG-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8 +// OGCG-HOST: @0 = private unnamed_addr constant [7 x i8] c"_Z2fnv\00", align 1 +// OGCG-HOST: @1 = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin", align 4096 +// OGCG-HOST: @__hip_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1212764230, i32 1, ptr @1, ptr null }, section ".hipFatBinSegment", align 8 +// OGCG-HOST: @__hip_gpubin_handle = internal global ptr null, align 8 +// OGCG-HOST: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @__hip_module_ctor, ptr null }] + +// OGCG-HOST: define internal void @__hip_register_globals(ptr %[[#LLVMFatbin:]]) { +// OGCG-HOST: entry: +// OGCG-HOST: call i32 @__hipRegisterFunction( +// OGCG-HOST-SAME: ptr %[[#LLVMFatbin]], +// OGCG-HOST-SAME: ptr @_Z2fnv, +// OGCG-HOST-SAME: ptr @0, +// OGCG-HOST-SAME: ptr @0, +// OGCG-HOST-SAME: i32 -1, +// OGCG-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) +// OGCG-HOST: } + +// OGCG-HOST: define internal void @__hip_module_ctor() { +// OGCG-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8 +// OGCG-HOST: %[[#ICMP:]] = icmp eq ptr %[[#LLVMHandleVar]], null +// OGCG-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]] +// OGCG-HOST: [[IFBLOCK]]: +// OGCG-HOST: %[[#Value:]] = call ptr @__hipRegisterFatBinary(ptr @__hip_fatbin_wrapper) +// OGCG-HOST: store ptr %[[#Value]], ptr @__hip_gpubin_handle, align 8 +// OGCG-HOST: br label %[[EXITBLOCK]] +// OGCG-HOST: [[EXITBLOCK]]: +// OGCG-HOST: %[[#HandleValue:]] = load ptr, ptr @__hip_gpubin_handle, align 8 +// OGCG-HOST: call void @__hip_register_globals(ptr %[[#HandleValue]]) +// OGCG-HOST: call i32 @atexit(ptr @__hip_module_dtor) +// OGCG-HOST: ret void + +// OGCG-HOST: define internal void @__hip_module_dtor() { +// OGCG-HOST: entry: +// OGCG-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8 +// OGCG-HOST: %[[#ICMP:]] = icmp ne ptr %[[#LLVMHandleVar]], null +// OGCG-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]] +// OGCG-HOST: [[IFBLOCK]]: ; preds = %entry +// OGCG-HOST: call void @__hipUnregisterFatBinary(ptr %[[#LLVMHandleVar]]) +// OGCG-HOST: store ptr null, ptr @__hip_gpubin_handle, align 8 +// OGCG-HOST: br label %[[EXITBLOCK]] +// OGCG-HOST: [[EXITBLOCK]]: ; preds = %[[IFBLOCK]], %entry +// OGCG-HOST: ret void +// OGCG-HOST: } +