From 3122e7683293ebc9eea4ca2cdf9717c7ae8415eb Mon Sep 17 00:00:00 2001 From: Yue Huang Date: Tue, 11 Nov 2025 17:48:37 +0000 Subject: [PATCH] [CIR][CUDA] Register __device__ global variables --- .../clang/CIR/Dialect/IR/CIRDataLayout.h | 4 + .../Dialect/Transforms/LoweringPrepare.cpp | 93 ++++++++++++++++++- clang/test/CIR/CodeGen/CUDA/registration.cu | 15 +++ 3 files changed, 108 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h index e6ad589f3a57..cc0c398c329a 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h +++ b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h @@ -129,6 +129,10 @@ class CIRDataLayout { mlir::Type getCharType(mlir::MLIRContext *ctx) const { return typeSizeInfo.getCharType(ctx); } + + mlir::Type getSizeType(mlir::MLIRContext *ctx) const { + return typeSizeInfo.getSizeType(ctx); + } }; /// Used to lazily calculate structure layout information for a target machine, diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index a2a04dc2a47d..e0078fd073f6 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -127,6 +127,8 @@ struct LoweringPreparePass : public LoweringPrepareBase { // Maps CUDA kernel name to device stub function. llvm::StringMap cudaKernelMap; + // Maps CUDA device-side variable name to host-side (shadow) GlobalOp. + llvm::StringMap cudaVarMap; void buildCUDAModuleCtor(); std::optional buildCUDAModuleDtor(); @@ -134,6 +136,8 @@ struct LoweringPreparePass : public LoweringPrepareBase { void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder, FuncOp regGlobalFunc); + void buildCUDARegisterVars(cir::CIRBaseBuilderTy &builder, + FuncOp regGlobalFunc); /// /// AST related @@ -1198,8 +1202,7 @@ std::optional LoweringPreparePass::buildCUDARegisterGlobals() { builder.setInsertionPointToStart(regGlobalFunc.addEntryBlock()); buildCUDARegisterGlobalFunctions(builder, regGlobalFunc); - - // TODO(cir): registration for global variables. + buildCUDARegisterVars(builder, regGlobalFunc); ReturnOp::create(builder, loc); return regGlobalFunc; @@ -1273,6 +1276,83 @@ void LoweringPreparePass::buildCUDARegisterGlobalFunctions( } } +void LoweringPreparePass::buildCUDARegisterVars(cir::CIRBaseBuilderTy &builder, + FuncOp regGlobalFunc) { + auto loc = theModule.getLoc(); + auto cudaPrefix = getCUDAPrefix(astCtx); + + auto voidTy = VoidType::get(&getContext()); + auto voidPtrTy = PointerType::get(voidTy); + auto voidPtrPtrTy = PointerType::get(voidPtrTy); + auto intTy = datalayout->getIntType(&getContext()); + auto charTy = datalayout->getCharType(&getContext()); + auto sizeTy = datalayout->getSizeType(&getContext()); + + // Extract the GPU binary handle argument. + mlir::Value fatbinHandle = *regGlobalFunc.args_begin(); + + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(theModule.getBody()); + + // Declare CUDA internal function: + // void __cudaRegisterVar( + // void **fatbinHandle, + // char *hostVarName, + // char *deviceVarName, + // const char *deviceVarName, + // int isExtern, size_t varSize, + // int isConstant, int zero + // ); + // Similar to the registration of global functions, OG does not care about + // pointer types. They will generate the same IR anyway. + + FuncOp cudaRegisterVar = buildRuntimeFunction( + globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterVar"), loc, + FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy, + sizeTy, intTy, intTy}, + voidTy)); + + unsigned int count = 0; + auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp { + auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size()); + + auto tmpString = GlobalOp::create( + globalBuilder, loc, (".str" + str + std::to_string(count++)).str(), + strType, /*isConstant=*/true, + /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); + + // We must make the string zero-terminated. + tmpString.setInitialValueAttr(ConstArrayAttr::get( + strType, StringAttr::get(&getContext(), str + "\0"))); + tmpString.setPrivate(); + return tmpString; + }; + + for (auto &[deviceSideName, global] : cudaVarMap) { + GlobalOp deviceNameStr = makeConstantString(deviceSideName); + mlir::Value deviceNameValue = builder.createBitcast( + builder.createGetGlobal(deviceNameStr), voidPtrTy); + + GlobalOp hostNameStr = makeConstantString(global.getName()); + mlir::Value hostNameValue = + builder.createBitcast(builder.createGetGlobal(hostNameStr), voidPtrTy); + + // Every device variable that has a shadow on host will not be extern. + // See CIRGenModule::emitGlobalVarDefinition. + auto isExtern = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); + llvm::TypeSize size = datalayout->getTypeSizeInBits(global.getSymType()); + auto varSize = ConstantOp::create( + builder, loc, IntAttr::get(sizeTy, size.getFixedValue() / 8)); + auto isConstant = ConstantOp::create( + builder, loc, IntAttr::get(intTy, global.getConstant())); + auto zero = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); + builder.createCallOp(loc, cudaRegisterVar, + {fatbinHandle, hostNameValue, deviceNameValue, + deviceNameValue, isExtern, varSize, isConstant, + zero}); + } +} + std::optional LoweringPreparePass::buildCUDAModuleDtor() { if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName())) return {}; @@ -1585,8 +1665,13 @@ void LoweringPreparePass::runOnOp(Operation *op) { lowerVAArgOp(vaArgOp); } else if (auto deleteArrayOp = dyn_cast(op)) { lowerDeleteArrayOp(deleteArrayOp); - } else if (auto getGlobal = dyn_cast(op)) { - lowerGlobalOp(getGlobal); + } else if (auto global = dyn_cast(op)) { + lowerGlobalOp(global); + if (auto attr = op->getAttr(cir::CUDAShadowNameAttr::getMnemonic())) { + auto shadowNameAttr = dyn_cast(attr); + std::string deviceSideName = shadowNameAttr.getDeviceSideName(); + cudaVarMap[deviceSideName] = global; + } } else if (auto dynamicCast = dyn_cast(op)) { lowerDynamicCastOp(dynamicCast); } else if (auto stdFind = dyn_cast(op)) { diff --git a/clang/test/CIR/CodeGen/CUDA/registration.cu b/clang/test/CIR/CodeGen/CUDA/registration.cu index 834e45204b77..661f916b0f16 100644 --- a/clang/test/CIR/CodeGen/CUDA/registration.cu +++ b/clang/test/CIR/CodeGen/CUDA/registration.cu @@ -50,6 +50,8 @@ __global__ void fn() {} +__device__ int a; + // CIR-HOST: cir.func internal private @__cuda_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) { // CIR-HOST: %[[#NULL:]] = cir.const #cir.ptr // CIR-HOST: %[[#T1:]] = cir.get_global @".str_Z2fnv" @@ -64,6 +66,16 @@ __global__ void fn() {} // CIR-HOST-SAME: %[[#DeviceFn]], // CIR-HOST-SAME: %[[#MinusOne]], // CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]]) +// CIR-HOST: %[[#T3:]] = cir.get_global @".stra0" +// CIR-HOST: %[[#Device:]] = cir.cast bitcast %7 +// CIR-HOST: %[[#T4:]] = cir.get_global @".stra1" +// CIR-HOST: %[[#Host:]] = cir.cast bitcast %9 +// CIR-HOST: %[[#Ext:]] = cir.const #cir.int<0> +// CIR-HOST: %[[#Sz:]] = cir.const #cir.int<4> +// CIR-HOST: %[[#Const:]] = cir.const #cir.int<0> +// CIR-HOST: %[[#Zero:]] = cir.const #cir.int<0> +// CIR-HOST: cir.call @__cudaRegisterVar(%arg0, %[[#Host]], %[[#Device]], %[[#Device]], +// CIR-HOST-SAME: %[[#Ext]], %[[#Sz]], %[[#Const]], %[[#Zero]]) // CIR-HOST: } // LLVM-HOST: define internal void @__cuda_register_globals(ptr %[[#LLVMFatbin:]]) { @@ -74,6 +86,9 @@ __global__ void fn() {} // LLVM-HOST-SAME: ptr @.str_Z2fnv, // LLVM-HOST-SAME: i32 -1, // LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) +// LLVM-HOST: call void @__cudaRegisterVar( +// LLVM-HOST-SAME: ptr %0, ptr @.stra1, ptr @.stra0, ptr @.stra0, +// LLVM-HOST-SAME: i32 0, i64 4, i32 0, i32 0) // LLVM-HOST: } // The content in const array should be the same as echoed above,