
[flang][cuda] Carry over the dynamic shared memory size to gpu.launch_func #132837


Merged: 1 commit merged into llvm:main from cuf_dynsmem_conv on Mar 25, 2025

Conversation

clementval (Contributor)

When converting the cuf.kernel_launch operation to gpu.launch_func, the dynamic shared memory size was always set to zero. Carry the value over from the cuf.kernel_launch op instead.
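For illustration, a minimal before/after sketch of the conversion, based on the updated test in flang/test/Fir/CUDA/cuda-launch.fir (the %c1_i32 and %c1024_i32 constants and the index-typed SSA names are illustrative):

    // Input: cuf.kernel_launch carrying a dynamic shared memory size as the
    // seventh chevron operand (%c1024_i32 = 1024 bytes, illustrative).
    cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1024_i32>>>()

    // Output of CUFOpConversion: the size is forwarded to
    // dynamic_shared_memory_size instead of being hard-coded to zero.
    // %g0-%g2 and %b0-%b2 stand for the index casts of the grid/block operands.
    gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%g0, %g1, %g2) threads in (%b0, %b1, %b2) dynamic_shared_memory_size %c1024_i32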

@llvmbot added the flang (Flang issues not falling into any other category) and flang:fir-hlfir labels on Mar 24, 2025
@llvmbot (Member)

llvmbot commented Mar 24, 2025

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

When converting the cuf.kernel_launch operation to gpu.launch_func, the dynamic shared memory size was always set to zero. Carry the value over from the cuf.kernel_launch op instead.


Full diff: https://github.com/llvm/llvm-project/pull/132837.diff

2 Files Affected:

  • (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+4-3)
  • (modified) flang/test/Fir/CUDA/cuda-launch.fir (+4)
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 0fbec8a204b8d..a01100511ec66 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -810,7 +810,7 @@ struct CUFLaunchOpConversion
                   mlir::PatternRewriter &rewriter) const override {
     mlir::Location loc = op.getLoc();
     auto idxTy = mlir::IndexType::get(op.getContext());
-    auto zero = rewriter.create<mlir::arith::ConstantOp>(
+    mlir::Value zero = rewriter.create<mlir::arith::ConstantOp>(
         loc, rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0));
     auto gridSizeX =
         rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridX());
@@ -869,10 +869,11 @@ struct CUFLaunchOpConversion
       }
       args.push_back(arg);
     }
-
+    mlir::Value dynamicShmemSize = op.getBytes() ? op.getBytes() : zero;
     auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
         loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
-        mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero, args);
+        mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ},
+        dynamicShmemSize, args);
     if (clusterDimX && clusterDimY && clusterDimZ) {
       gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
       gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 7833fc7b490bf..b8d79ca06ffd6 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -23,11 +23,15 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
     // CHECK: %[[ALLOCA:.*]] = fir.alloca f32
     %c1 = arith.constant 1 : index
     %c11_i32 = arith.constant 11 : i32
+    %c1024_i32 = arith.constant 1024 : i32
     %c6_i32 = arith.constant 6 : i32
     %c1_i32 = arith.constant 1 : i32
     // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c0{{.*}}
     cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
 
+    // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c1024{{.*}}
+    cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1024_i32>>>()
+
     // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>)
     cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)
     return

@clementval requested a review from wangzpgi on March 24, 2025 at 22:44
@razvanlupusoru (Contributor) left a comment


Nice catch! Thank you!

@clementval merged commit 5be9082 into llvm:main on Mar 25, 2025
14 checks passed
@clementval deleted the cuf_dynsmem_conv branch on March 25, 2025 at 01:37
Labels: flang (Flang issues not falling into any other category), flang:fir-hlfir