diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 9d707250d11d9..a746beae8d9c2 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -12,6 +12,7 @@
 
 #include "flang/Optimizer/CodeGen/CodeGen.h"
 
+#include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/CodeGen/CodeGenOpenMP.h"
 #include "flang/Optimizer/CodeGen/FIROpPatterns.h"
 #include "flang/Optimizer/CodeGen/LLVMInsertChainFolder.h"
@@ -1846,6 +1847,15 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
 };
 
 static bool isDeviceAllocation(mlir::Value val, mlir::Value adaptorVal) {
+  // Check if the global symbol is in the device module.
+  if (auto addr = mlir::dyn_cast_or_null<fir::AddrOfOp>(val.getDefiningOp()))
+    if (auto gpuMod =
+            addr->getParentOfType<mlir::ModuleOp>()
+                .lookupSymbol<mlir::gpu::GPUModuleOp>(cudaDeviceModuleName))
+      if (gpuMod.lookupSymbol<fir::GlobalOp>(addr.getSymbol()) ||
+          gpuMod.lookupSymbol<mlir::LLVM::GlobalOp>(addr.getSymbol()))
+        return true;
+
   if (auto loadOp = mlir::dyn_cast_or_null<fir::LoadOp>(val.getDefiningOp()))
     return isDeviceAllocation(loadOp.getMemref(), {});
   if (auto boxAddrOp =
diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir
index 672be13beae24..632f8afebbb92 100644
--- a/flang/test/Fir/CUDA/cuda-code-gen.mlir
+++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -221,3 +221,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry
 
   func.func private @__tgt_acc_get_deviceptr() -> !fir.ref<!fir.box<none>>
 }
+
+// -----
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+    %1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+    fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+  }
+  func.func @_QQmain() attributes {fir.bindc_name = "P", target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+mmx", "+sse", "+sse2", "+cx8", "+x87", "+fxsr"]>} {
+    %c64 = arith.constant 64 : index
+    %c1 = arith.constant 1 : index
+    %c0_i32 = arith.constant 0 : i32
+    %0 = fir.address_of(@_QMm1Eda) : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+    %8 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+    %9 = fircg.ext_rebox %8 : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<!fir.array<?x?xf32>>
+    gpu.launch_func @cuda_device_mod::@_QMm1Psub2 blocks in (%c1, %c1, %c1) threads in (%c64, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%9 : !fir.box<!fir.array<?x?xf32>>) {cuf.proc_attr = #cuf.cuda_proc<global>}
+    return
+  }
+  gpu.module @cuda_device_mod [#nvvm.target] attributes {llvm.data_layout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"} {
+    fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+      %c0 = arith.constant 0 : index
+      %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+      %1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+      fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+    }
+    gpu.func @_QMm1Psub2(%arg0: !fir.box<!fir.array<?x?xf32>>) kernel {
+      gpu.return
+    }
+  }
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: llvm.call @_FortranACUFAllocDescriptor
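
Note (illustrative, not part of the patch): the new isDeviceAllocation check boils down to a nested MLIR symbol-table lookup. Below is a minimal standalone C++ sketch of that pattern. The helper name hasSymbolInDeviceModule is hypothetical; the hard-coded "cuda_device_mod" mirrors cudaDeviceModuleName from flang/Optimizer/Builder/CUFCommon.h and the gpu.module name used in the test above.

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/BuiltinOps.h"
#include "llvm/ADT/StringRef.h"

// Hypothetical helper: returns true when the gpu.module used for CUDA device
// code also defines a symbol (global or function) with the given name.
static bool hasSymbolInDeviceModule(mlir::ModuleOp mod,
                                    llvm::StringRef symName) {
  // Look up the nested device module by its well-known name.
  if (auto gpuMod =
          mod.lookupSymbol<mlir::gpu::GPUModuleOp>("cuda_device_mod"))
    // lookupSymbol returns the defining operation or null; any hit means the
    // symbol is also materialized on the device side.
    return gpuMod.lookupSymbol(symName) != nullptr;
  return false;
}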