From 1db01a2bda0d3704ef273ec665d4be98e4c9472d Mon Sep 17 00:00:00 2001
From: Asher Mancinelli
Date: Thu, 16 Oct 2025 15:42:54 -0700
Subject: [PATCH 1/4] [mlir][python] Add tests for gpu.launch(_func) ops

These are the tests I wish I could have referred to during development.
Also corrected some small documentation mistakes.
---
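Notes (dropped by git am): the new tests follow the existing pattern in this
file -- each function decorated with @run builds a module, prints it, and the
printed IR is matched by FileCheck. Assuming a build tree configured with
-DMLIR_ENABLE_BINDINGS_PYTHON=ON, a typical way to run just this file from the
build directory would be something like the following; the exact path of the
test in the build tree depends on the local setup:

    bin/llvm-lit -v tools/mlir/test/python/dialects/gpu/dialect.py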
 mlir/docs/Dialects/GPU.md                  |  2 +-
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td |  2 +-
 mlir/test/python/dialects/gpu/dialect.py   | 99 +++++++++++++++++++++-
 3 files changed, 100 insertions(+), 3 deletions(-)

diff --git a/mlir/docs/Dialects/GPU.md b/mlir/docs/Dialects/GPU.md
index 8d4d2ca3e5743..c16ed57737e5b 100644
--- a/mlir/docs/Dialects/GPU.md
+++ b/mlir/docs/Dialects/GPU.md
@@ -121,7 +121,7 @@ func.func @main() {
   gpu.launch blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1)
              threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) {
-    gpu.printf "Hello from %d\n" %6 : index
+    gpu.printf "Hello from %d\n", %6 : index
     gpu.terminator
   }
   return
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 987fc13e0508d..a6c6038e1e224 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -584,7 +584,7 @@ def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
     This operation provides a memref pointer to the start of dynamic shared
     memory, often referred to as workgroup memory. It's important to note that
     this dynamic shared memory needs to be allocated at kernel launch. One can
-    conveniently utilize `the dynamic_shared_memory_size` parameter of
+    conveniently utilize the `dynamic_shared_memory_size` parameter of
     `gpu.launch` for this purpose.

     Examples:
diff --git a/mlir/test/python/dialects/gpu/dialect.py b/mlir/test/python/dialects/gpu/dialect.py
index 66c401886804c..24f20d109b3d0 100644
--- a/mlir/test/python/dialects/gpu/dialect.py
+++ b/mlir/test/python/dialects/gpu/dialect.py
@@ -2,7 +2,8 @@

 from mlir.ir import *
 import mlir.ir as ir
-import mlir.dialects.gpu as gpu
+from mlir.dialects import gpu, func, arith, math
+from mlir.extras import types as T
 import mlir.dialects.gpu.passes
 from mlir.passmanager import *

@@ -157,3 +158,99 @@ def builder(func: gpu.GPUFuncOp) -> None:
     # CHECK: %[[VAL_0:.*]] = gpu.global_id x
     # CHECK: gpu.return
     # CHECK: }
+
+# CHECK-LABEL: testGPULaunchFuncOp
+@run
+def testGPULaunchFuncOp():
+    module = Module.create()
+
+    module.operation.attributes["gpu.container_module"] = UnitAttr.get()
+    with InsertionPoint(module.body):
+        gpu_module = gpu.GPUModuleOp("gpu_module")
+        block = gpu_module.bodyRegion.blocks.append()
+
+    with InsertionPoint(block):
+        gpu_func = gpu.GPUFuncOp(
+            FunctionType.get([], []),
+            "kernel",
+            body_builder=lambda func: gpu.return_([]),
+            kernel=True,
+        )
+
+    with InsertionPoint(module.body):
+        host = func.FuncOp(type=FunctionType.get([], []), name="host")
+
+    with InsertionPoint(host.add_entry_block()):
+        c1 = arith.constant(T.index(), 1)
+        grid_sizes = [c1] * 3
+        block_sizes = [c1] * 3
+        sym_ref = SymbolRefAttr.get([gpu_module.sym_name.value, gpu_func.name.value])
+        token_type = Type.parse("!gpu.async.token")
+        token = gpu.wait(async_token=token_type, async_dependencies=[])
+        token = gpu.launch_func(
+            async_token=token_type,
+            async_dependencies=[token],
+            kernel=sym_ref,
+            grid_size_x=grid_sizes[0],
+            grid_size_y=grid_sizes[1],
+            grid_size_z=grid_sizes[2],
+            block_size_x=block_sizes[0],
+            block_size_y=block_sizes[1],
+            block_size_z=block_sizes[2],
+            kernel_operands=[],
+        )
+        gpu.wait(async_token=None, async_dependencies=[token])
+        func.ReturnOp([])
+
+    print(module)
+
+    # CHECK-LABEL: gpu.module @gpu_module {
+    # CHECK: gpu.func @kernel() kernel {
+    # CHECK: gpu.return
+    # CHECK: }
+    # CHECK: }
+
+    # CHECK-LABEL: func.func @host() {
+    # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
+    # CHECK: %[[WAIT_0:.*]] = gpu.wait async
+    # CHECK: %[[LAUNCH_FUNC_0:.*]] = gpu.launch_func async {{\[}}%[[WAIT_0]]] @gpu_module::@kernel blocks in (%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]]) threads in (%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]])
+    # CHECK: gpu.wait {{\[}}%[[LAUNCH_FUNC_0]]]
+    # CHECK: return
+    # CHECK: }
+
+
+# CHECK-LABEL: testGPULaunchOp
+@run
+def testGPULaunchOp():
+    module = Module.create()
+
+    with InsertionPoint(module.body):
+        host = func.FuncOp(type=FunctionType.get([T.f32()], []), name="gpu_printf")
+
+    entry_block = host.add_entry_block()
+    with InsertionPoint(entry_block):
+        c1 = arith.constant(T.index(), 1)
+
+        launch = gpu.launch(None, [], c1, c1, c1, c1, c1, c1)
+        launch_block = launch.regions[0].blocks.append()
+        for _ in range(12):
+            launch_block.add_argument(T.index(), Location.unknown())
+
+        with InsertionPoint(launch_block):
+            gpu.printf("%f", [entry_block.arguments[0]])
+            gpu.terminator()
+
+    with InsertionPoint(entry_block):
+        func.ReturnOp([])
+
+    print(module)
+
+    # CHECK-LABEL: func.func @gpu_printf(
+    # CHECK-SAME: %[[ARG0:.*]]: f32) {
+    # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
+    # CHECK: gpu.launch blocks(%[[VAL_0:.*]], %[[VAL_1:.*]], %[[VAL_2:.*]]) in (%[[VAL_3:.*]] = %[[CONSTANT_0]], %[[VAL_4:.*]] = %[[CONSTANT_0]], %[[VAL_5:.*]] = %[[CONSTANT_0]]) threads(%[[VAL_6:.*]], %[[VAL_7:.*]], %[[VAL_8:.*]]) in (%[[VAL_9:.*]] = %[[CONSTANT_0]], %[[VAL_10:.*]] = %[[CONSTANT_0]], %[[VAL_11:.*]] = %[[CONSTANT_0]]) {
+    # CHECK: gpu.printf "%[[VAL_12:.*]]", %[[ARG0]] : f32
+    # CHECK: gpu.terminator
+    # CHECK: }
+    # CHECK: return
+    # CHECK: }

From 6fecd4733d124fcf69904f2b13da36c79b36b5fd Mon Sep 17 00:00:00 2001
From: Asher Mancinelli
Date: Thu, 16 Oct 2025 15:54:01 -0700
Subject: [PATCH 2/4] format

---
 mlir/test/python/dialects/gpu/dialect.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/test/python/dialects/gpu/dialect.py b/mlir/test/python/dialects/gpu/dialect.py
index 24f20d109b3d0..75855bc927ca5 100644
--- a/mlir/test/python/dialects/gpu/dialect.py
+++ b/mlir/test/python/dialects/gpu/dialect.py
@@ -159,6 +159,7 @@ def builder(func: gpu.GPUFuncOp) -> None:
     # CHECK: gpu.return
     # CHECK: }

+
 # CHECK-LABEL: testGPULaunchFuncOp
 @run
 def testGPULaunchFuncOp():

From 7e5056dd7e3098eca11516826345823373f3faea Mon Sep 17 00:00:00 2001
From: Asher Mancinelli
Date: Thu, 16 Oct 2025 18:29:16 -0700
Subject: [PATCH 3/4] Adapt some GPU utils from mlir-python-extras

---
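Notes (dropped by git am): the adapted helpers accept plain Python integers
for grid/block sizes (converting them to index constants), infer the async
token from the dependency list, and wrap gpu.launch as a region op so the body
can be supplied as a callable. A minimal sketch of the intended usage,
mirroring the test updates below; it assumes a kernel already declared as
@gpu_module::@kernel and an insertion point inside a host function:

    # Launch a kernel by symbol, chaining async tokens explicitly.
    token = gpu.wait()
    token = gpu.launch_func(
        async_dependencies=[token],
        kernel=["gpu_module", "kernel"],
        grid_size=(1, 1, 1),
        block_size=(1, 1, 1),
        kernel_operands=[],
    )
    gpu.wait(async_dependencies=[token])

    # Or build an inline gpu.launch region: the twelve block/thread ids and
    # extents arrive as index block arguments, and gpu.terminator is
    # appended automatically by the region_op wrapper.
    launch = gpu.launch((1, 1, 1), (1, 1, 1))
    launch(lambda *args: gpu.printf("%d", args[0]))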
 mlir/python/mlir/dialects/gpu/__init__.py | 184 +++++++++++++++++++++-
 mlir/test/python/dialects/gpu/dialect.py  |  44 +++---
 2 files changed, 203 insertions(+), 25 deletions(-)

diff --git a/mlir/python/mlir/dialects/gpu/__init__.py b/mlir/python/mlir/dialects/gpu/__init__.py
index b14ea68938160..18ae52736dd2e 100644
--- a/mlir/python/mlir/dialects/gpu/__init__.py
+++ b/mlir/python/mlir/dialects/gpu/__init__.py
@@ -6,7 +6,7 @@
 from .._gpu_ops_gen import _Dialect
 from .._gpu_enum_gen import *
 from ..._mlir_libs._mlirDialectsGPU import *
-from typing import Callable, Sequence, Union, Optional, List
+from typing import Any, Callable, Sequence, Tuple, Union, Optional, List

 try:
     from ...ir import (
@@ -21,15 +21,24 @@
         DictAttr,
         Attribute,
         DenseI32ArrayAttr,
+        Value,
     )
+    from ...extras.meta import region_op
+    from ...extras import types as T
+    from ..arith import constant, ConstantOp
     from .._ods_common import (
         get_default_loc_context as _get_default_loc_context,
         _cext as _ods_cext,
+        get_op_result_or_op_results,
     )
 except ImportError as e:
     raise RuntimeError("Error loading imports from extension module") from e

+
+def gpu_async_token():
+    return Type.parse("!gpu.async.token")
+
+
 @_ods_cext.register_operation(_Dialect, replace=True)
 class GPUFuncOp(GPUFuncOp):
     __doc__ = GPUFuncOp.__doc__
@@ -151,3 +160,176 @@ def entry_block(self) -> Block:
     @property
     def arguments(self) -> Sequence[Type]:
         return self.function_type.value.inputs
+
+
+def _convert_literal_to_constant(value: int | ConstantOp) -> Value:
+    if isinstance(value, int):
+        return constant(T.index(), value)
+    elif isinstance(value, (ConstantOp, Value)):
+        return value
+    else:
+        raise ValueError(f"Invalid value: {value}")
+
+
+@_ods_cext.register_operation(_Dialect, replace=True)
+class LaunchFuncOp(LaunchFuncOp):
+    __doc__ = LaunchFuncOp.__doc__
+
+    def __init__(
+        self,
+        kernel: List[str],
+        grid_size: Tuple[Any, Any, Any],
+        block_size: Tuple[Any, Any, Any],
+        kernel_operands: Optional[List[Value]] = None,
+        async_dependencies: Optional[List[Value]] = None,
+        dynamic_shared_memory_size: Optional[Value] = None,
+        async_object=None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        if async_dependencies is None:
+            async_dependencies = []
+        async_token = None
+        if len(async_dependencies):
+            async_token = gpu_async_token()
+
+        grid_size_x, grid_size_y, grid_size_z = map(
+            _convert_literal_to_constant, grid_size
+        )
+        block_size_x, block_size_y, block_size_z = map(
+            _convert_literal_to_constant, block_size
+        )
+
+        super().__init__(
+            async_token,
+            async_dependencies,
+            kernel,
+            grid_size_x,
+            grid_size_y,
+            grid_size_z,
+            block_size_x,
+            block_size_y,
+            block_size_z,
+            kernel_operands,
+            dynamicSharedMemorySize=dynamic_shared_memory_size,
+            asyncObject=async_object,
+            loc=loc,
+            ip=ip,
+        )
+
+
+def launch_func(
+    kernel: List[str],
+    grid_size: Tuple[Any, Any, Any],
+    block_size: Tuple[Any, Any, Any],
+    kernel_operands: Optional[List[Value]] = None,
+    async_dependencies: Optional[List[Value]] = None,
+    dynamic_shared_memory_size: Optional[Value] = None,
+    async_object=None,
+    *,
+    loc=None,
+    ip=None,
+) -> Union[Value, List[Value], LaunchFuncOp]:
+    op = LaunchFuncOp(
+        kernel=kernel,
+        grid_size=grid_size,
+        block_size=block_size,
+        kernel_operands=kernel_operands,
+        async_dependencies=async_dependencies,
+        dynamic_shared_memory_size=dynamic_shared_memory_size,
+        async_object=async_object,
+        loc=loc,
+        ip=ip,
+    )
+    results = op.results
+    if len(results) == 1:
+        return results[0]
+    elif len(results) > 1:
+        return results
+    else:
+        return op
+
+
+def wait(
+    async_dependencies: Optional[List[Value]] = None, *, loc=None, ip=None
+) -> Union[Value, List[Value], WaitOp]:
+    if async_dependencies is None:
+        async_dependencies = []
+    return get_op_result_or_op_results(
+        WaitOp(gpu_async_token(), async_dependencies, loc=loc, ip=ip)
+    )
+
+
+@_ods_cext.register_operation(_Dialect, replace=True)
+class LaunchOp(LaunchOp):
+    __doc__ = LaunchOp.__doc__
+
+    def __init__(
+        self,
+        grid_size: Tuple[Any, Any, Any],
+        block_size: Tuple[Any, Any, Any],
+        async_dependencies=None,
+        dynamic_shared_memory_size: Optional[Value] = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        if async_dependencies is None:
+            async_dependencies = []
+        async_token = None
+        if len(async_dependencies):
+            async_token = gpu_async_token()
+        grid_size_x, grid_size_y, grid_size_z = map(
+            _convert_literal_to_constant, grid_size
+        )
+        block_size_x, block_size_y, block_size_z = map(
+            _convert_literal_to_constant, block_size
+        )
+
+        super().__init__(
+            async_token,
+            async_dependencies,
+            grid_size_x,
+            grid_size_y,
+            grid_size_z,
+            block_size_x,
+            block_size_y,
+            block_size_z,
+            dynamicSharedMemorySize=dynamic_shared_memory_size,
+            loc=loc,
+            ip=ip,
+        )
+        self.regions[0].blocks.append(*[T.index() for _ in range(12)])
+
+
+def launch_(
+    grid_size: Tuple[Any, Any, Any],
+    block_size: Tuple[Any, Any, Any],
+    async_dependencies=None,
+    dynamic_shared_memory_size: Optional[Value] = None,
+    *,
+    loc=None,
+    ip=None,
+):
+    grid_size = tuple(map(_convert_literal_to_constant, grid_size))
+    block_size = tuple(map(_convert_literal_to_constant, block_size))
+    launch_op = LaunchOp(
+        grid_size,
+        block_size,
+        async_dependencies,
+        dynamic_shared_memory_size,
+        loc=loc,
+        ip=ip,
+    )
+    return launch_op
+
+
+launch = region_op(launch_, terminator=lambda *_args: terminator())
+
+
+_printf = printf
+
+
+def printf(format, *args, loc=None, ip=None):
+    return _printf(format=format, args=args, loc=loc, ip=ip)
diff --git a/mlir/test/python/dialects/gpu/dialect.py b/mlir/test/python/dialects/gpu/dialect.py
index 75855bc927ca5..3945c99c41091 100644
--- a/mlir/test/python/dialects/gpu/dialect.py
+++ b/mlir/test/python/dialects/gpu/dialect.py
@@ -183,24 +183,17 @@ def testGPULaunchFuncOp():

     with InsertionPoint(host.add_entry_block()):
         c1 = arith.constant(T.index(), 1)
-        grid_sizes = [c1] * 3
-        block_sizes = [c1] * 3
-        sym_ref = SymbolRefAttr.get([gpu_module.sym_name.value, gpu_func.name.value])
-        token_type = Type.parse("!gpu.async.token")
-        token = gpu.wait(async_token=token_type, async_dependencies=[])
+        grid_sizes = (1, 1, 1)
+        block_sizes = (1, 1, 1)
+        token = gpu.wait()
         token = gpu.launch_func(
-            async_token=token_type,
             async_dependencies=[token],
-            kernel=sym_ref,
-            grid_size_x=grid_sizes[0],
-            grid_size_y=grid_sizes[1],
-            grid_size_z=grid_sizes[2],
-            block_size_x=block_sizes[0],
-            block_size_y=block_sizes[1],
-            block_size_z=block_sizes[2],
+            kernel=[gpu_module.sym_name.value, gpu_func.name.value],
+            grid_size=grid_sizes,
+            block_size=block_sizes,
             kernel_operands=[],
         )
-        gpu.wait(async_token=None, async_dependencies=[token])
+        gpu.wait(async_dependencies=[token])
         func.ReturnOp([])

     print(module)
@@ -214,8 +207,14 @@ def testGPULaunchFuncOp():
     # CHECK-LABEL: func.func @host() {
     # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
     # CHECK: %[[WAIT_0:.*]] = gpu.wait async
-    # CHECK: %[[LAUNCH_FUNC_0:.*]] = gpu.launch_func async {{\[}}%[[WAIT_0]]] @gpu_module::@kernel blocks in (%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]]) threads in (%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]])
-    # CHECK: gpu.wait {{\[}}%[[LAUNCH_FUNC_0]]]
+    # CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
+    # CHECK: %[[LAUNCH_FUNC_0:.*]] = gpu.launch_func async {{\[}}%[[WAIT_0]]] @gpu_module::@kernel blocks in (%[[CONSTANT_1]], %[[CONSTANT_2]], %[[CONSTANT_3]]) threads in (%[[CONSTANT_4]], %[[CONSTANT_5]], %[[CONSTANT_6]])
+    # CHECK: %[[WAIT_1:.*]] = gpu.wait async {{\[}}%[[LAUNCH_FUNC_0]]]
     # CHECK: return
     # CHECK: }

@@ -231,15 +230,12 @@ def testGPULaunchOp():
     module = Module.create()

     with InsertionPoint(module.body):
         host = func.FuncOp(type=FunctionType.get([T.f32()], []), name="gpu_printf")

     entry_block = host.add_entry_block()
     with InsertionPoint(entry_block):
         c1 = arith.constant(T.index(), 1)
+        grid_sizes = (c1, c1, c1)
+        block_sizes = (c1, c1, c1)

-        launch = gpu.launch(None, [], c1, c1, c1, c1, c1, c1)
-        launch_block = launch.regions[0].blocks.append()
-        for _ in range(12):
-            launch_block.add_argument(T.index(), Location.unknown())
+        launch = gpu.launch(grid_sizes, block_sizes)

-        with InsertionPoint(launch_block):
-            gpu.printf("%f", [entry_block.arguments[0]])
-            gpu.terminator()
+        op = launch(lambda *args: gpu.printf("%f", args[0]))

     with InsertionPoint(entry_block):
         func.ReturnOp([])
@@ -250,7 +246,7 @@ def testGPULaunchOp():
     # CHECK-LABEL: func.func @gpu_printf(
     # CHECK-SAME: %[[ARG0:.*]]: f32) {
     # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
     # CHECK: gpu.launch blocks(%[[VAL_0:.*]], %[[VAL_1:.*]], %[[VAL_2:.*]]) in (%[[VAL_3:.*]] = %[[CONSTANT_0]], %[[VAL_4:.*]] = %[[CONSTANT_0]], %[[VAL_5:.*]] = %[[CONSTANT_0]]) threads(%[[VAL_6:.*]], %[[VAL_7:.*]], %[[VAL_8:.*]]) in (%[[VAL_9:.*]] = %[[CONSTANT_0]], %[[VAL_10:.*]] = %[[CONSTANT_0]], %[[VAL_11:.*]] = %[[CONSTANT_0]]) {
-    # CHECK: gpu.printf "%[[VAL_12:.*]]", %[[ARG0]] : f32
+    # CHECK: gpu.printf "%[[VAL_12:.*]]", %[[VAL_0]] : index
     # CHECK: gpu.terminator
     # CHECK: }
     # CHECK: return
     # CHECK: }

From 5f004479c9696aba8b1ec4de5826612246fb84bf Mon Sep 17 00:00:00 2001
From: Asher Mancinelli
Date: Thu, 16 Oct 2025 18:50:09 -0700
Subject: [PATCH 4/4] Use legacy Union spelling

---
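Notes (dropped by git am): PEP 604's `int | ConstantOp` spelling only works at
runtime on Python 3.10+, and annotations on a `def` are evaluated when the
module is imported (absent `from __future__ import annotations`), so the
previous spelling would raise on the older interpreters the bindings still
support. A small illustration of the difference, with hypothetical names:

    # TypeError on Python <= 3.9: unsupported operand type(s) for |
    def f(x: int | str) -> None: ...

    # Equivalent, and imports cleanly on every supported version.
    from typing import Union
    def g(x: Union[int, str]) -> None: ...

`Value` is also added to the union, since the helper already accepted it.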
 mlir/python/mlir/dialects/gpu/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/python/mlir/dialects/gpu/__init__.py b/mlir/python/mlir/dialects/gpu/__init__.py
index 18ae52736dd2e..2fbcbb059f87a 100644
--- a/mlir/python/mlir/dialects/gpu/__init__.py
+++ b/mlir/python/mlir/dialects/gpu/__init__.py
@@ -162,7 +162,7 @@ def arguments(self) -> Sequence[Type]:
         return self.function_type.value.inputs


-def _convert_literal_to_constant(value: int | ConstantOp) -> Value:
+def _convert_literal_to_constant(value: Union[int, ConstantOp, Value]) -> Value:
     if isinstance(value, int):
         return constant(T.index(), value)
     elif isinstance(value, (ConstantOp, Value)):