diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
index f61e40ef5f969..4eb6379adf6e7 100644
--- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
+++ b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
@@ -19,17 +19,12 @@ namespace mlir {
 class Location;
 class ModuleOp;
 
-template <typename T>
-class OpPassBase;
-
-namespace gpu {
-class GPUModuleOp;
-} // namespace gpu
-
 namespace LLVM {
 class LLVMDialect;
 } // namespace LLVM
 
+template <typename T> class OpPassBase;
+
 using OwnedCubin = std::unique_ptr<std::vector<char>>;
 using CubinGenerator =
     std::function<OwnedCubin(const std::string &, Location, StringRef)>;
@@ -43,7 +38,7 @@ using CubinGenerator =
 /// attached as a string attribute named 'nvvm.cubin' to the kernel function.
 /// After the transformation, the body of the kernel function is removed (i.e.,
 /// it is turned into a declaration).
-std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
+std::unique_ptr<OpPassBase<ModuleOp>>
 createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);
 
 /// Creates a pass to convert a gpu.launch_func operation into a sequence of
diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
index b3212279fab98..75e4f7e374c6a 100644
--- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
+++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
@@ -14,19 +14,15 @@ namespace mlir {
 class LLVMTypeConverter;
 class OwningRewritePatternList;
 
-template <typename OpT>
-class OpPassBase;
-
-namespace gpu {
-class GPUModuleOp;
-}
+class ModuleOp;
+template <typename OpT> class OpPassBase;
 
 /// Collect a set of patterns to convert from the GPU dialect to NVVM.
 void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                          OwningRewritePatternList &patterns);
 
 /// Creates a pass that lowers GPU dialect operations to NVVM counterparts.
-std::unique_ptr<OpPassBase<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass();
+std::unique_ptr<OpPassBase<ModuleOp>> createLowerGpuOpsToNVVMOpsPass();
 
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index 3df6ff4be0c1d..766ddbf202c25 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -588,56 +588,4 @@ def GPU_BarrierOp : GPU_Op<"barrier"> {
   let printer = [{ p << getOperationName(); }];
 }
 
-def GPU_GPUModuleOp : GPU_Op<"module", [
-  IsolatedFromAbove, SymbolTable, Symbol,
-  SingleBlockImplicitTerminator<"ModuleEndOp">
-]> {
-  let summary = "A top level compilation unit containing code to be run on a GPU.";
-  let description = [{
-    GPU module contains code that is intended to be run on a GPU. A host device
-    can launch this code through a gpu.launc_func that creates a fully
-    qualified symbol through the gpu.module's symbol and a gpu.func symbol
-    contained in the gpu.module.
-
-    The module's top-level scope is modeled by a single region with a single
-    block. GPU modules are required to have a name that is used for symbol
-    resolution by the gpu.launch_func operation.
-
-    Using an op with a region to define a GPU module enables "embedding" GPU
-    modules with SIMT execution models in other dialects in a clean manner and
-    allows filtering of code regions to execute passes on only code intended to
-    or not intended to be run on the separate device.
-
-    ```
-      gpu.module @symbol_name {
-      gpu.func {}
-        ...
-      gpu.module_end
-    }
-
-    ```
-  }];
-  let builders = [OpBuilder<"Builder *builder, OperationState &result, "
-                            "StringRef name">];
-  let parser = [{ return ::parseGPUModuleOp(parser, result); }];
-  let printer = [{ return ::print(p, *this); }];
-  let regions = (region SizedRegion<1>:$body);
-
-  // We need to ensure the block inside the region is properly terminated;
-  // the auto-generated builders do not guarantee that.
-  let skipDefaultBuilders = 1;
-}
-
-def GPU_ModuleEndOp : GPU_Op<"module_end", [
-  Terminator, HasParent<"GPUModuleOp">
-]> {
-  let summary = "A pseudo op that marks the end of a gpu.module.";
-  let description = [{
-    This op terminates the only block inside the only region of a `gpu.module`.
-  }];
-
-  let parser = [{ return success(); }];
-  let printer = [{ p << getOperationName(); }];
-}
-
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
index b111c96313c2c..66a2e66f99a4e 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
@@ -46,15 +46,18 @@ static constexpr const char *kCubinAnnotation = "nvvm.cubin";
 /// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
 /// GPU binary code, which is then attached as an attribute to the function. The
 /// function body is erased.
-class GpuKernelToCubinPass
-    : public OperationPass<GpuKernelToCubinPass, gpu::GPUModuleOp> {
+class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
 public:
   GpuKernelToCubinPass(
       CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
       : cubinGenerator(cubinGenerator) {}
 
-  void runOnOperation() override {
-    gpu::GPUModuleOp module = getOperation();
+  void runOnModule() override {
+    ModuleOp module = getModule();
+    if (!module.getAttrOfType<UnitAttr>(
+            gpu::GPUDialect::getKernelModuleAttrName()) ||
+        !module.getName())
+      return;
 
     // Make sure the NVPTX target is initialized.
     LLVMInitializeNVPTXTarget();
@@ -68,8 +71,8 @@ class GpuKernelToCubinPass
 
     // Translate the module to CUBIN and attach the result as attribute to the
     // module.
-    if (auto cubinAttr = translateGPUModuleToCubinAnnotation(
-            *llvmModule, module.getLoc(), module.getName()))
+    if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
+            *llvmModule, module.getLoc(), *module.getName()))
       module.setAttr(kCubinAnnotation, cubinAttr);
     else
       signalPassFailure();
@@ -89,7 +92,7 @@ class GpuKernelToCubinPass
                                   StringRef name);
 
   /// Translates llvmModule to cubin and returns the result as attribute.
-  StringAttr translateGPUModuleToCubinAnnotation(llvm::Module &llvmModule,
+  StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
                                                  Location loc, StringRef name);
 
   CubinGenerator cubinGenerator;
@@ -146,7 +149,7 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
   return cubinGenerator(ptx, loc, name);
 }
 
-StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation(
+StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
     llvm::Module &llvmModule, Location loc, StringRef name) {
   auto cubin = convertModuleToCubin(llvmModule, loc, name);
   if (!cubin)
@@ -154,7 +157,7 @@ StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation(
   return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
 }
 
-std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
+std::unique_ptr<OpPassBase<ModuleOp>>
 mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
   return std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
 }
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
index 31024d2881b5c..41f69d6e21d1b 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -132,9 +132,9 @@ class GpuLaunchFuncToCudaCallsPass
 
     // GPU kernel modules are no longer necessary since we have a global
     // constant with the CUBIN data.
-    for (auto m :
-         llvm::make_early_inc_range(getModule().getOps<gpu::GPUModuleOp>()))
-      m.erase();
+    for (auto m : llvm::make_early_inc_range(getModule().getOps<ModuleOp>()))
+      if (m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
+        m.erase();
   }
 
 private:
@@ -343,8 +343,8 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
                                                builder.getI32IntegerAttr(0));
   // Create an LLVM global with CUBIN extracted from the kernel annotation and
   // obtain a pointer to the first byte in it.
-  auto kernelModule = getModule().lookupSymbol<gpu::GPUModuleOp>(
-      launchOp.getKernelModuleName());
+  auto kernelModule =
+      getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
   assert(kernelModule && "expected a kernel module");
 
   auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
@@ -354,7 +354,8 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
     return signalPassFailure();
   }
 
-  SmallString<128> nameBuffer(kernelModule.getName());
+  assert(kernelModule.getName() && "expected a named module");
+  SmallString<128> nameBuffer(*kernelModule.getName());
   nameBuffer.append(kCubinStorageSuffix);
   Value data = LLVM::createGlobalString(
       loc, builder, nameBuffer.str(), cubinAttr.getValue(),
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 84bc7ff1d5f52..e2b1e0e533c53 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -200,7 +200,7 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
     auto type = operand.getType().cast<LLVM::LLVMType>();
 
     // Create shared memory array to store the warp reduction.
-    auto module = operand.getDefiningOp()->getParentOfType<gpu::GPUModuleOp>();
+    auto module = operand.getDefiningOp()->getParentOfType<ModuleOp>();
     assert(module && "op must belong to a module");
     Value sharedMemPtr =
         createSharedMemoryArray(loc, module, type, kWarpSize, rewriter);
@@ -391,10 +391,10 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
   }
 
   /// Creates a global array stored in shared memory.
-  Value createSharedMemoryArray(Location loc, gpu::GPUModuleOp module,
+  Value createSharedMemoryArray(Location loc, ModuleOp module,
                                 LLVM::LLVMType elementType, int numElements,
                                 ConversionPatternRewriter &rewriter) const {
-    OpBuilder builder(module.body());
+    OpBuilder builder(module.getBodyRegion());
 
     auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
     StringRef name = "reduce_buffer";
@@ -699,11 +699,13 @@ struct GPUReturnOpLowering : public LLVMOpLowering {
 ///
 /// This pass only handles device code and is not meant to be run on GPU host
 /// code.
-class LowerGpuOpsToNVVMOpsPass
-    : public OperationPass<LowerGpuOpsToNVVMOpsPass, gpu::GPUModuleOp> {
+class LowerGpuOpsToNVVMOpsPass : public ModulePass<LowerGpuOpsToNVVMOpsPass> {
 public:
-  void runOnOperation() override {
-    gpu::GPUModuleOp m = getOperation();
+  void runOnModule() override {
+    ModuleOp m = getModule();
+    if (!m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
+      return;
+
     OwningRewritePatternList patterns;
     NVVMTypeConverter converter(m.getContext());
     populateStdToLLVMConversionPatterns(converter, patterns);
@@ -716,7 +718,7 @@ class LowerGpuOpsToNVVMOpsPass
     target.addLegalDialect<LLVM::LLVMDialect>();
     target.addLegalDialect<NVVM::NVVMDialect>();
     // TODO(csigg): Remove once we support replacing non-root ops.
-    target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
+    target.addLegalOp<gpu::YieldOp>();
     if (failed(applyPartialConversion(m, target, patterns, &converter)))
       signalPassFailure();
   }
@@ -748,8 +750,7 @@ void mlir::populateGpuToNVVMConversionPatterns(
                                                "__nv_exp");
 }
 
-std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToNVVMOpsPass() {
+std::unique_ptr<OpPassBase<ModuleOp>> mlir::createLowerGpuOpsToNVVMOpsPass() {
   return std::make_unique<LowerGpuOpsToNVVMOpsPass>();
 }
 
diff --git a/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt
index adeb4e099ab9e..be82894461d61 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt
@@ -1,15 +1,8 @@
-set(LLVM_TARGET_DEFINITIONS GPUToSPIRV.td)
-mlir_tablegen(GPUToSPIRV.cpp.inc -gen-rewriters)
-add_public_tablegen_target(MLIRGPUToSPIRVIncGen)
-
 add_llvm_library(MLIRGPUtoSPIRVTransforms
   ConvertGPUToSPIRV.cpp
   ConvertGPUToSPIRVPass.cpp
   )
 
-add_dependencies(MLIRGPUtoSPIRVTransforms
-  MLIRGPUToSPIRVIncGen)
-
 target_link_libraries(MLIRGPUtoSPIRVTransforms
   MLIRGPU
   MLIRIR
diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp
index a90cea99be492..2fd8cedfd63b5 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp
@@ -63,13 +63,27 @@ class KernelFnConversion final : public SPIRVOpLowering<gpu::GPUFuncOp> {
   SmallVector<int32_t, 3> workGroupSizeAsInt32;
 };
 
-/// Pattern to convert a gpu.module to a spv.module.
-class GPUModuleConversion final : public SPIRVOpLowering<gpu::GPUModuleOp> {
+/// Pattern to convert a module with gpu.kernel_module attribute to a
+/// spv.module.
+class KernelModuleConversion final : public SPIRVOpLowering<ModuleOp> {
 public:
-  using SPIRVOpLowering<gpu::GPUModuleOp>::SPIRVOpLowering;
+  using SPIRVOpLowering<ModuleOp>::SPIRVOpLowering;
 
   PatternMatchResult
-  matchAndRewrite(gpu::GPUModuleOp moduleOp, ArrayRef<Value> operands,
+  matchAndRewrite(ModuleOp moduleOp, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
+/// Pattern to convert a module terminator op to a terminator of spv.module op.
+// TODO: Move this into DRR, but that requires ModuleTerminatorOp to be defined
+// in ODS.
+class KernelModuleTerminatorConversion final
+    : public SPIRVOpLowering<ModuleTerminatorOp> {
+public:
+  using SPIRVOpLowering<ModuleTerminatorOp>::SPIRVOpLowering;
+
+  PatternMatchResult
+  matchAndRewrite(ModuleTerminatorOp terminatorOp, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const override;
 };
 
@@ -270,12 +284,16 @@ KernelFnConversion::matchAndRewrite(gpu::GPUFuncOp funcOp,
 }
 
 //===----------------------------------------------------------------------===//
-// ModuleOp with gpu.module.
+// ModuleOp with gpu.kernel_module.
 //===----------------------------------------------------------------------===//
 
-PatternMatchResult GPUModuleConversion::matchAndRewrite(
-    gpu::GPUModuleOp moduleOp, ArrayRef<Value> operands,
+PatternMatchResult KernelModuleConversion::matchAndRewrite(
+    ModuleOp moduleOp, ArrayRef<Value> operands,
     ConversionPatternRewriter &rewriter) const {
+  if (!moduleOp.getAttrOfType<UnitAttr>(
+          gpu::GPUDialect::getKernelModuleAttrName())) {
+    return matchFailure();
+  }
   // TODO : Generalize this to account for different extensions,
   // capabilities, extended_instruction_sets, other addressing models
   // and memory models.
@@ -284,8 +302,8 @@ PatternMatchResult GPUModuleConversion::matchAndRewrite(
       spirv::MemoryModel::GLSL450, spirv::Capability::Shader,
       spirv::Extension::SPV_KHR_storage_buffer_storage_class);
   // Move the region from the module op into the SPIR-V module.
-  Region &spvModuleRegion = spvModule.body();
-  rewriter.inlineRegionBefore(moduleOp.body(), spvModuleRegion,
+  Region &spvModuleRegion = spvModule.getOperation()->getRegion(0);
+  rewriter.inlineRegionBefore(moduleOp.getBodyRegion(), spvModuleRegion,
                               spvModuleRegion.begin());
   // The spv.module build method adds a block with a terminator. Remove that
   // block. The terminator of the module op in the remaining block will be
@@ -295,6 +313,17 @@ PatternMatchResult GPUModuleConversion::matchAndRewrite(
   return matchSuccess();
 }
 
+//===----------------------------------------------------------------------===//
+// ModuleTerminatorOp for gpu.kernel_module.
+//===----------------------------------------------------------------------===//
+
+PatternMatchResult KernelModuleTerminatorConversion::matchAndRewrite(
+    ModuleTerminatorOp terminatorOp, ArrayRef<Value> operands,
+    ConversionPatternRewriter &rewriter) const {
+  rewriter.replaceOpWithNewOp<spirv::ModuleEndOp>(terminatorOp);
+  return matchSuccess();
+}
+
 //===----------------------------------------------------------------------===//
 // GPU return inside kernel functions to SPIR-V return.
 //===----------------------------------------------------------------------===//
@@ -313,18 +342,14 @@ PatternMatchResult GPUReturnOpConversion::matchAndRewrite(
 // GPU To SPIRV Patterns.
 //===----------------------------------------------------------------------===//
 
-namespace {
-#include "GPUToSPIRV.cpp.inc"
-}
-
 void mlir::populateGPUToSPIRVPatterns(MLIRContext *context,
                                       SPIRVTypeConverter &typeConverter,
                                       OwningRewritePatternList &patterns,
                                       ArrayRef<int64_t> workGroupSize) {
-  populateWithGenerated(context, &patterns);
   patterns.insert<KernelFnConversion>(context, typeConverter, workGroupSize);
   patterns.insert<
-      GPUReturnOpConversion, ForOpConversion, GPUModuleConversion,
+      GPUReturnOpConversion, ForOpConversion, KernelModuleConversion,
+      KernelModuleTerminatorConversion,
       LaunchConfigConversion<gpu::BlockDimOp, spirv::BuiltIn::WorkgroupSize>,
       LaunchConfigConversion<gpu::BlockIdOp, spirv::BuiltIn::WorkgroupId>,
       LaunchConfigConversion<gpu::GridDimOp, spirv::BuiltIn::NumWorkgroups>,
diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp
index 4dda8bdc2b39f..9b758d1052c76 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp
@@ -60,12 +60,15 @@ void GPUToSPIRVPass::runOnModule() {
 
   SmallVector<Operation *, 1> kernelModules;
   OpBuilder builder(context);
-  module.walk([&builder, &kernelModules](gpu::GPUModuleOp moduleOp) {
-    // For each kernel module (should be only 1 for now, but that is not a
-    // requirement here), clone the module for conversion because the
-    // gpu.launch function still needs the kernel module.
-    builder.setInsertionPoint(moduleOp.getOperation());
-    kernelModules.push_back(builder.clone(*moduleOp.getOperation()));
+  module.walk([&builder, &kernelModules](ModuleOp moduleOp) {
+    if (moduleOp.getAttrOfType<UnitAttr>(
+            gpu::GPUDialect::getKernelModuleAttrName())) {
+      // For each kernel module (should be only 1 for now, but that is not a
+      // requirement here), clone the module for conversion because the
+      // gpu.launch function still needs the kernel module.
+      builder.setInsertionPoint(moduleOp.getOperation());
+      kernelModules.push_back(builder.clone(*moduleOp.getOperation()));
+    }
   });
 
   SPIRVTypeConverter typeConverter;
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td
deleted file mode 100644
index cfe9d26273cc4..0000000000000
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td
+++ /dev/null
@@ -1,22 +0,0 @@
-//===-- GPUToSPIRV.td - GPU to SPIR-V Dialect Lowerings ----*- tablegen -*-===//
-//
-// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains patterns to lower GPU dialect ops to to SPIR-V ops.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef CONVERT_GPU_TO_SPIRV
-#define CONVERT_GPU_TO_SPIRV
-
-include "mlir/Dialect/GPU/GPUOps.td"
-include "mlir/Dialect/SPIRV/SPIRVStructureOps.td"
-
-def : Pat<(GPU_ModuleEndOp), (SPV_ModuleEndOp)>;
-
-#endif // CONVERT_GPU_TO_SPIRV
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 083e282602015..e5bcc69734474 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -72,10 +72,15 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
 
     // Check that `launch_func` refers to a well-formed GPU kernel module.
     StringRef kernelModuleName = launchOp.getKernelModuleName();
-    auto kernelModule = module.lookupSymbol<GPUModuleOp>(kernelModuleName);
+    auto kernelModule = module.lookupSymbol<ModuleOp>(kernelModuleName);
     if (!kernelModule)
       return launchOp.emitOpError()
              << "kernel module '" << kernelModuleName << "' is undefined";
+    if (!kernelModule.getAttrOfType<UnitAttr>(
+            GPUDialect::getKernelModuleAttrName()))
+      return launchOp.emitOpError("module '")
+             << kernelModuleName << "' is missing the '"
+             << GPUDialect::getKernelModuleAttrName() << "' attribute";
 
     // Check that `launch_func` refers to a well-formed kernel function.
     StringRef kernelName = launchOp.kernel();
@@ -512,9 +517,10 @@ void LaunchFuncOp::build(Builder *builder, OperationState &result,
   result.addOperands(kernelOperands);
   result.addAttribute(getKernelAttrName(),
                       builder->getStringAttr(kernelFunc.getName()));
-  auto kernelModule = kernelFunc.getParentOfType<GPUModuleOp>();
-  result.addAttribute(getKernelModuleAttrName(),
-                      builder->getSymbolRefAttr(kernelModule.getName()));
+  auto kernelModule = kernelFunc.getParentOfType<ModuleOp>();
+  if (Optional<StringRef> kernelModuleName = kernelModule.getName())
+    result.addAttribute(getKernelModuleAttrName(),
+                        builder->getSymbolRefAttr(*kernelModuleName));
 }
 
 void LaunchFuncOp::build(Builder *builder, OperationState &result,
@@ -814,47 +820,6 @@ LogicalResult GPUFuncOp::verifyBody() {
   return success();
 }
 
-//===----------------------------------------------------------------------===//
-// GPUModuleOp
-//===----------------------------------------------------------------------===//
-
-void GPUModuleOp::build(Builder *builder, OperationState &result,
-                        StringRef name) {
-  ensureTerminator(*result.addRegion(), *builder, result.location);
-  result.attributes.push_back(builder->getNamedAttr(
-      ::mlir::SymbolTable::getSymbolAttrName(), builder->getStringAttr(name)));
-}
-
-static ParseResult parseGPUModuleOp(OpAsmParser &parser,
-                                    OperationState &result) {
-  StringAttr nameAttr;
-  if (parser.parseSymbolName(nameAttr, SymbolTable::getSymbolAttrName(),
-                             result.attributes))
-    return failure();
-
-  // If module attributes are present, parse them.
-  if (parser.parseOptionalAttrDictWithKeyword(result.attributes))
-    return failure();
-
-  // Parse the module body.
-  auto *body = result.addRegion();
-  if (parser.parseRegion(*body, None, None))
-    return failure();
-
-  // Ensure that this module has a valid terminator.
-  GPUModuleOp::ensureTerminator(*body, parser.getBuilder(), result.location);
-  return success();
-}
-
-static void print(OpAsmPrinter &p, GPUModuleOp op) {
-  p << op.getOperationName() << ' ';
-  p.printSymbolName(op.getName());
-  p.printOptionalAttrDictWithKeyword(op.getAttrs(),
-                                     {SymbolTable::getSymbolAttrName()});
-  p.printRegion(op.getOperation()->getRegion(0), /*printEntryBlockArgs=*/false,
-                /*printBlockTerminators=*/false);
-}
-
 // Namespace avoids ambiguous ReturnOpOperandAdaptor.
 namespace mlir {
 namespace gpu {
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 0f8e2253980c2..37f9c2e7b843f 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -140,8 +140,8 @@ namespace {
 /// inside a nested module. It also creates an external function of the same
 /// name in the parent module.
 ///
-/// The gpu.modules are intended to be compiled to a cubin blob independently in
-/// a separate pass. The external functions can then be annotated with the
+/// The kernel modules are intended to be compiled to a cubin blob independently
+/// in a separate pass. The external functions can then be annotated with the
 /// symbol of the cubin accessor function.
 class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
 public:
@@ -174,19 +174,15 @@ class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
   }
 
 private:
-  // Returns a gpu.module containing kernelFunc and all callees (recursive).
-  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
-                                      const SymbolTable &parentSymbolTable) {
-    // TODO: This code cannot use an OpBuilder because it must be inserted into
-    // a SymbolTable by the caller. SymbolTable needs to be refactored to
-    // prevent manual building of Ops with symbols in code using SymbolTables
-    // and then this needs to use the OpBuilder.
+  // Returns a module containing kernelFunc and all callees (recursive).
+  ModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
+                              const SymbolTable &parentSymbolTable) {
     auto context = getModule().getContext();
     Builder builder(context);
-    OperationState state(kernelFunc.getLoc(),
-                         gpu::GPUModuleOp::getOperationName());
-    gpu::GPUModuleOp::build(&builder, state, kernelFunc.getName());
-    auto kernelModule = cast<gpu::GPUModuleOp>(Operation::create(state));
+    auto kernelModule =
+        ModuleOp::create(builder.getUnknownLoc(), kernelFunc.getName());
+    kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(),
+                         builder.getUnitAttr());
     SymbolTable symbolTable(kernelModule);
     symbolTable.insert(kernelFunc);
 
diff --git a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
index 707f4a0639587..6865462595f74 100644
--- a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
+++ b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
@@ -5,7 +5,7 @@ module attributes {gpu.container_module} {
   // CHECK: llvm.mlir.global internal constant @[[kernel_name:.*]]("kernel\00")
   // CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN")
 
-  gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN"} {
+  module @kernel_module attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} {
     gpu.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} {
       gpu.return
     }
diff --git a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir
index 78b9f56b6202c..62fe2b993388b 100644
--- a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir
+++ b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s --test-kernel-to-cubin -split-input-file | FileCheck %s
 
-// CHECK: attributes {nvvm.cubin = "CUBIN"}
-gpu.module @foo {
+// CHECK: attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"}
+module @foo attributes {gpu.kernel_module} {
   llvm.func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
     // CHECK: attributes  {gpu.kernel}
     attributes  { gpu.kernel } {
@@ -11,7 +11,7 @@ gpu.module @foo {
 
 // -----
 
-gpu.module @bar {
+module @bar attributes {gpu.kernel_module} {
   // CHECK: func @kernel_a
   llvm.func @kernel_a()
     attributes  { gpu.kernel } {
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 7f69cb7482c54..24bf56557c3e2 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file | FileCheck %s
 
-gpu.module @test_module {
+module attributes {gpu.kernel_module} {
   // CHECK-LABEL: func @gpu_index_ops()
   func @gpu_index_ops()
       attributes { gpu.kernel } {
@@ -38,7 +38,7 @@ gpu.module @test_module {
 
 // -----
 
-gpu.module @test_module {
+module attributes {gpu.kernel_module} {
   // CHECK-LABEL: func @gpu_all_reduce_op()
   func @gpu_all_reduce_op()
       attributes { gpu.kernel } {
@@ -55,7 +55,7 @@ gpu.module @test_module {
 
 // -----
 
-gpu.module @test_module {
+module attributes {gpu.kernel_module} {
   // CHECK-LABEL: func @gpu_all_reduce_region()
   func @gpu_all_reduce_region()
       attributes { gpu.kernel } {
@@ -74,7 +74,7 @@ gpu.module @test_module {
 
 // -----
 
-gpu.module @test_module {
+module attributes {gpu.kernel_module} {
   // CHECK-LABEL: func @gpu_shuffle()
   func @gpu_shuffle()
       attributes { gpu.kernel } {
@@ -99,7 +99,7 @@ gpu.module @test_module {
 
 // -----
 
-gpu.module @test_module {
+module attributes {gpu.kernel_module} {
   // CHECK-LABEL: func @gpu_sync()
   func @gpu_sync()
       attributes { gpu.kernel } {
@@ -111,7 +111,7 @@ gpu.module @test_module {
 
 // -----
 
-gpu.module @test_module {
+module attributes {gpu.kernel_module} {
   // CHECK: llvm.func @__nv_fabsf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_fabs(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_fabs
@@ -126,7 +126,7 @@ gpu.module @test_module {
 
 // -----
 
-gpu.module @test_module {
+module attributes {gpu.kernel_module} {
   // CHECK: llvm.func @__nv_ceilf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_ceil(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_ceil
@@ -141,7 +141,7 @@ gpu.module @test_module {
 
 // -----
 
-gpu.module @test_module {
+module attributes {gpu.kernel_module} {
   // CHECK: llvm.func @__nv_cosf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_cos(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_cos
@@ -156,7 +156,7 @@ gpu.module @test_module {
 
 // -----
 
-gpu.module @test_module {
+module attributes {gpu.kernel_module} {
   // CHECK: llvm.func @__nv_expf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_exp(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_exp
@@ -174,7 +174,7 @@ gpu.module @test_module {
 // -----
 
 // Test that we handled properly operation with SymbolTable other than module op
-gpu.module @test_module {
+module attributes {gpu.kernel_module} {
   "test.symbol_scope"() ({
   // CHECK: test.symbol_scope
   // CHECK: llvm.func @__nv_expf(!llvm.float) -> !llvm.float
diff --git a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir b/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir
index 115c71d128004..69a16b25139e4 100644
--- a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s
 
-gpu.module @kernel {
+module attributes {gpu.kernel_module} {
   // CHECK-LABEL:  llvm.func @private
   gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) {
     // Allocate private memory inside the function.
@@ -32,7 +32,7 @@ gpu.module @kernel {
 
 // -----
 
-gpu.module @kernel {
+module attributes {gpu.kernel_module} {
   // Workgroup buffers are allocated as globals.
   // CHECK: llvm.mlir.global internal @[[buffer:.*]]()
   // CHECK-SAME:  addr_space = 3
@@ -72,7 +72,7 @@ gpu.module @kernel {
 
 // -----
 
-gpu.module @kernel {
+module attributes {gpu.kernel_module} {
   // Check that the total size was computed correctly.
   // CHECK: llvm.mlir.global internal @[[buffer:.*]]()
   // CHECK-SAME:  addr_space = 3
@@ -113,7 +113,7 @@ gpu.module @kernel {
 
 // -----
 
-gpu.module @kernel {
+module attributes {gpu.kernel_module} {
   // Check that several buffers are defined.
   // CHECK: llvm.mlir.global internal @[[buffer1:.*]]()
   // CHECK-SAME:  !llvm<"[1 x float]">
diff --git a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
index 7f4081e4eda0c..c0a68a9db2af4 100644
--- a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
@@ -9,7 +9,7 @@ module attributes {gpu.container_module} {
 
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     gpu.func @builtin_workgroup_id_x()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
@@ -32,7 +32,7 @@ module attributes {gpu.container_module} {
 
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     gpu.func @builtin_workgroup_id_y()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
@@ -55,7 +55,7 @@ module attributes {gpu.container_module} {
 
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     gpu.func @builtin_workgroup_id_z()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
@@ -78,7 +78,7 @@ module attributes {gpu.container_module} {
 
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize")
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     gpu.func @builtin_workgroup_size_x()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPSIZE]]
@@ -101,7 +101,7 @@ module attributes {gpu.container_module} {
 
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[LOCALINVOCATIONID:@.*]] built_in("LocalInvocationId")
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     gpu.func @builtin_local_id_x()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[LOCALINVOCATIONID]]
@@ -124,7 +124,7 @@ module attributes {gpu.container_module} {
 
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[NUMWORKGROUPS:@.*]] built_in("NumWorkgroups")
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     gpu.func @builtin_num_workgroups_x()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[NUMWORKGROUPS]]
diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
index 446c0d602ed3c..d104c96cfa9d9 100644
--- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
@@ -16,7 +16,7 @@ module attributes {gpu.container_module} {
   }
 
   // CHECK-LABEL: spv.module "Logical" "GLSL450"
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     // CHECK-DAG: spv.globalVariable [[WORKGROUPSIZEVAR:@.*]] built_in("WorkgroupSize") : !spv.ptr<vector<3xi32>, Input>
     // CHECK-DAG: spv.globalVariable [[NUMWORKGROUPSVAR:@.*]] built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
     // CHECK-DAG: spv.globalVariable [[LOCALINVOCATIONIDVAR:@.*]] built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
diff --git a/mlir/test/Conversion/GPUToSPIRV/loop.mlir b/mlir/test/Conversion/GPUToSPIRV/loop.mlir
index bd97315a2ea4c..6d38360b7e84a 100644
--- a/mlir/test/Conversion/GPUToSPIRV/loop.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/loop.mlir
@@ -7,7 +7,7 @@ module attributes {gpu.container_module} {
     return
   }
 
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     gpu.func @loop_kernel(%arg2 : memref<10xf32>, %arg3 : memref<10xf32>)
     attributes {gpu.kernel} {
       // CHECK: [[LB:%.*]] = spv.constant 4 : i32
diff --git a/mlir/test/Conversion/GPUToSPIRV/simple.mlir b/mlir/test/Conversion/GPUToSPIRV/simple.mlir
index cca5eb9d0b492..e1b687c1a0b0d 100644
--- a/mlir/test/Conversion/GPUToSPIRV/simple.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/simple.mlir
@@ -2,7 +2,7 @@
 
 module attributes {gpu.container_module} {
 
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     // CHECK:       spv.module "Logical" "GLSL450" {
     // CHECK-LABEL: func @kernel_1
     // CHECK-SAME: {{%.*}}: f32 {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}}
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 8f900bf6b5cef..8323fdf8709e2 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -167,7 +167,7 @@ module attributes {gpu.container_module} {
   }
 
   func @launch_func_missing_module_attribute(%sz : index) {
-    // expected-error@+1 {{kernel module 'kernels' is undefined}}
+    // expected-error@+1 {{module 'kernels' is missing the 'gpu.kernel_module' attribute}}
     "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
     { kernel = "kernel_1", kernel_module = @kernels }
         : (index, index, index, index, index, index) -> ()
@@ -178,7 +178,8 @@ module attributes {gpu.container_module} {
 // -----
 
 module attributes {gpu.container_module} {
-  gpu.module @kernels { }
+  module @kernels attributes {gpu.kernel_module} {
+  }
 
   func @launch_func_undefined_function(%sz : index) {
     // expected-error@+1 {{kernel function 'kernel_1' is undefined}}
@@ -192,7 +193,7 @@ module attributes {gpu.container_module} {
 // -----
 
 module attributes {gpu.container_module} {
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     gpu.func @kernel_1(%arg1 : !llvm<"float*">) kernel {
       gpu.return
     }
@@ -210,7 +211,7 @@ module attributes {gpu.container_module} {
 // -----
 
 module attributes {gpu.container_module} {
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     gpu.func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
       gpu.return
     }
@@ -228,7 +229,7 @@ module attributes {gpu.container_module} {
 
 // -----
 
-gpu.module @kernels {
+module @kernels attributes {gpu.kernel_module} {
   gpu.func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
     gpu.return
   }
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 033e7cbcb7e11..1dd08cea492a0 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -60,7 +60,7 @@ module attributes {gpu.container_module} {
     return
   }
 
-  gpu.module @kernels {
+  module @kernels attributes {gpu.kernel_module} {
     gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) attributes {gpu.kernel} {
       %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
       %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 425b4b3090c5f..5adb881a1dc6a 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -136,7 +136,7 @@ func @recursive_device_function() {
   gpu.return
 }
 
-// CHECK: gpu.module @function_call_kernel {
+// CHECK: module @function_call_kernel attributes {gpu.kernel_module} {
 // CHECK:   gpu.func @function_call_kernel()
 // CHECK:     call @device_function() : () -> ()
 // CHECK:     call @device_function() : () -> ()
diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
index a05016f48e86e..d6160d6d6e0c8 100644
--- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
+++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -105,7 +105,7 @@ static LogicalResult runMLIRPasses(ModuleOp m) {
   applyPassManagerCLOptions(pm);
 
   pm.addPass(createGpuKernelOutliningPass());
-  auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
+  auto &kernelPm = pm.nest<ModuleOp>();
   kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
   kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
   pm.addPass(createLowerToLLVMPass());