diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
index f7ca6dd7624bb..822e60da7c20d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
@@ -33,6 +33,8 @@ add_mlir_dialect(NVVMOps nvvm)
 add_mlir_doc(NVVMOps NVVMDialect Dialects/ -gen-dialect-doc)
 set(LLVM_TARGET_DEFINITIONS NVVMOps.td)
 mlir_tablegen(NVVMConversions.inc -gen-llvmir-conversions)
+mlir_tablegen(NVVMOpsEnums.h.inc -gen-enum-decls)
+mlir_tablegen(NVVMOpsEnums.cpp.inc -gen-enum-defs)
 add_public_tablegen_target(MLIRNVVMConversionsIncGen)
 
 add_mlir_dialect(ROCDLOps rocdl)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
index fff82e3b9f4f4..4c5d786f119a8 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
@@ -18,6 +18,16 @@
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+
+#include "mlir/Dialect/LLVMIR/NVVMOpsEnums.h.inc"
+
+/// Return the element type and number of elements associated with a wmma
+/// matrix of given characteristics. This matches the logic in the
+/// IntrinsicsNVVM.td WMMA_REGS structure.
+std::pair<mlir::Type, unsigned> inferMMAType(mlir::NVVM::MMATypes type,
+                                             mlir::NVVM::MMAFrag frag,
+                                             mlir::MLIRContext *context);
 
 ///// Ops /////
 #define GET_OP_CLASSES
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index a3f5a84dad59f..b7aa50198d86e 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -151,124 +151,355 @@ def NVVM_MmaOp :
   let verifier = [{ return ::verify(*this); }];
 }
 
-// Base class for all the variants of WMMA loadOps that may be defined.
-class NVVM_WMMALoadOp<string mnemonic> : NVVM_Op<mnemonic>,
-  Results<(outs LLVM_AnyStruct:$res)>,
-  Arguments<(ins Variadic<LLVM_Type>:$args)> {
+/// Helpers to instantiate different versions of wmma intrinsics.
+/// This matches the hierarchy used in IntrinsicsNVVM.td to define all the
+/// combinations of the intrinsics.
+class GEOM<int M, int N, int K> {
+  int m = M;
+  int n = N;
+  int k = K;
+}
 
-  let summary = "Warp synchronous matrix load";
+/// Class containing information about valid mma matrix types.
+class WMMA_REGS<GEOM Geom, string Frag, string PtxEltType> {
+  int m = Geom.m;
+  int n = Geom.n;
+  int k = Geom.k;
+  string geom = "m"#Geom.m#"n"#Geom.n#"k"#Geom.k;
+  string frag = Frag;
+  string ptx_elt_type = PtxEltType;
+  string gft = geom#":"#Frag#":"#ptx_elt_type;
+}
 
-  string baseDescription = [{"The `nvvm.wmma.m*n*k*.load.[a, b, c]` operation"
-    "loads a matrix collectively using all the threads in a warp."
+/// Generate enum value of the mma.load/mma.store intrinsic.
+class WMMA_NAME_LDST<string Op, WMMA_REGS Frag, string Layout, int WithStride> {
+  string id = "llvm::Intrinsic::nvvm_wmma"
+              # "_" # Frag.geom
+              # "_" # Op
+              # "_" # Frag.frag
+              # "_" # Frag.ptx_elt_type
+              # "_" # Layout
+              # !if(WithStride, "_stride", "");
+}
 
-    "The operation takes two arguments, the address from where the matrix"
-    "elements are to be loaded from and a stride. The stride argument"
-    "represents the leading dimension of the source matrix. The address and"
-    "the stride are required to be the same across all threads in the warp."
-    "Each thread in a warp holds a certain number of elements. The Op returns"
-    "a LLVMStruct which holds the elements of the matrix held by this thread."
+/// Generate the signature part of the mma intrinsic name.
+class MMA_SIGNATURE<WMMA_REGS A, WMMA_REGS B, WMMA_REGS C, WMMA_REGS D> {
+  list<WMMA_REGS> id_frags = !cond(
+     // FP16 ops are identified by accumulator & result type.
+     !eq(A.ptx_elt_type, "f16") : [D, C],
+     // other ops are identified by input types.
+     !ne(A.ptx_elt_type, B.ptx_elt_type): [A, B],
+     true: [A]
+     );
+  string ret = !foldl("", id_frags, a, b, !strconcat(a, "_", b.ptx_elt_type));
+}
 
-    "This op is meant to be used along with `nvvm.wmma.m*n*k*.store` and"
-    "`nvvm.wmma.m*n*k*.mma`."}];
+/// Generate enum value of the wmma.mma intrinsic.
+class WMMA_NAME<string ALayout, string BLayout, WMMA_REGS A, WMMA_REGS B,
+                WMMA_REGS C, WMMA_REGS D> {
+  string signature = MMA_SIGNATURE<A, B, C, D>.ret;
+  string id = "llvm::Intrinsic::nvvm_wmma"
+              # "_" # A.geom
+              # "_mma"
+              # "_" # ALayout
+              # "_" # BLayout
+              # signature;
+}
 
-  let assemblyFormat = "$args attr-dict `:` functional-type($args, $res)";
+// Generates list of 4-tuples of WMMA_REGS representing a valid MMA op.
+//   Geom: list of supported geometries.
+//   TypeN: PTX type of the corresponding fragment's element.
+//   TypeB and TypeD may be empty if it must match that of TypeA or TypeC.
+class MMA_OPS<list<GEOM> Geom, list<string> TypeA, list<string> TypeB,
+              list<string> TypeC, list<string> TypeD> {
+  list<list<WMMA_REGS>> ret =
+    !foldl([]<list<WMMA_REGS>>, Geom, t1, geom, !listconcat(t1,
+    !foldl([]<list<WMMA_REGS>>, TypeA, t2, type_a, !listconcat(t2,
+    !foldl([]<list<WMMA_REGS>>, !if(!size(TypeB), TypeB, [type_a]), t3, type_b, !listconcat(t3,
+    !foldl([]<list<WMMA_REGS>>, TypeC, t4, type_c, !listconcat(t4,
+    !foldl([]<list<WMMA_REGS>>, !if(!size(TypeD), TypeD, [type_c]), t5, type_d, !listconcat(t5,
+           [[WMMA_REGS<geom, "a", type_a>,
+             WMMA_REGS<geom, "b", type_b>,
+             WMMA_REGS<geom, "c", type_c>,
+             WMMA_REGS<geom, "d", type_d>]]))))))))));
+  // Debugging aid for readable representation of the list above.
+  list<list<string>> ops = !foreach(x, ret, [x[0].gft, x[1].gft, x[2].gft, x[3].gft]);
+}
 
-def NVVM_WMMALoadAM16N16K16Op :
-  NVVM_WMMALoadOp<"wmma.m16n16k16.load.a.f16.row.stride">{
+/// Creates a list of combinations of load/store operations supported.
+class MMA_LDST_OPS<list<GEOM> Geom, list<string> Frags, list<string> Types> {
+  list<WMMA_REGS> ret =
+    !foldl([]<WMMA_REGS>, Geom, t1, geom, !listconcat(t1,
+    !foldl([]<WMMA_REGS>, Frags, t2, frag, !listconcat(t2,
+    !foldl([]<WMMA_REGS>, Types, t3, type, !listconcat(t3,
+           [WMMA_REGS<geom, frag, type>]))))));
+  // Debugging aid for readable representation of the list above.
+  list<string> ops = !foreach(x, ret, x.gft);
+}
 
-  string llvmBuilder = [{
-    $res = createNvvmIntrinsicCall(
-      builder, llvm::Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride, $args);
-  }];
+// Creates list of valid combinations of fragments. This is a subset of what
+// llvm supports and can be extended as needed.
+class NVVM_MMA_OPS {
+  list<list<WMMA_REGS>> tf32_wmma_ops = MMA_OPS<
+    [GEOM<16, 16, 8>],
+    ["tf32"], [], ["f32"], []>.ret;
+  list<list<WMMA_REGS>> fp_wmma_ops = MMA_OPS<
+    [GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],
+    ["f16"], [], ["f16", "f32"], []>.ret;
+  list<list<WMMA_REGS>> all_wmma_ops = !listconcat(
+    tf32_wmma_ops,
+    fp_wmma_ops);
+
+  list<WMMA_REGS> ldst_ab_ops = MMA_LDST_OPS<
+    [GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],
+    ["a", "b"], ["f16"]>.ret;
+  list<WMMA_REGS> ldst_cd_ops = MMA_LDST_OPS<
+    [GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],
+    ["c", "d"], ["f16", "f32"]>.ret;
+  list<WMMA_REGS> ldst_tf32_ab_ops = MMA_LDST_OPS<
+    [GEOM<16, 16, 8>],
+    ["a", "b"], ["tf32"]>.ret;
+  list<WMMA_REGS> ldst_tf32_cd_ops = MMA_LDST_OPS<
+    [GEOM<16, 16, 8>],
+    ["c", "d"], ["f32"]>.ret;
+  list<WMMA_REGS> all_ldst_ops = !listconcat(ldst_ab_ops, ldst_cd_ops,
+                                             ldst_tf32_ab_ops,
+                                             ldst_tf32_cd_ops);
+  // Separate A/B/C fragments (loads) from D (stores).
+  list<WMMA_REGS> all_ld_ops = !filter(op, all_ldst_ops, !ne(op.frag, "d"));
+  list<WMMA_REGS> all_st_ops = !filter(op, all_ldst_ops, !eq(op.frag, "d"));
+}
 
-  string opDescription = [{
-    Example:
+def NVVM_MMA_OPS : NVVM_MMA_OPS;
+
+/// Helper to create the mapping between the configuration and the store
+/// intrinsic enum value.
+class MMA_ST_INTR<string op> {
+  list<list<string>> cond0 = !foreach(frag, NVVM_MMA_OPS.all_st_ops,
+                               !foreach(layout, ["row", "col"],
+    "if (layout == \"" # layout # "\" && m == " # frag.m # " &&"
+    " n == " #frag.n # " && k == " # frag.k # " && \"" #
+    frag.ptx_elt_type # "\" == eltype)"
+    "  return " #WMMA_NAME_LDST<op, frag, layout, 1>.id #";"));
+  string id = !foldl("",
+                !foldl([""], cond0, acc, el, !listconcat(acc, el)),
+                acc1, el1, acc1 # "\n" # el1);
+}
 
-    ```mlir
-    %2 = nvvm.wmma.m16n16k16.load.a %0, %1 : !llvm.ptr<i32>, !llvm.i32 ->
-      !llvm.struct<(vec<2 x half>, vec<2 x half>, vec<2 x half>, vec<2 x half>,
-      vec<2 x half>, vec<2 x half>, vec<2 x half>, vec<2 x half>)>
-    ```
-  }];
+/// Helper to map a mxk shape to a supported mxnxk matrix type. This will return
+/// the n value of the supported configuration.
+class MMA_ST_INFER_N<list<WMMA_REGS> ldst> {
+  list<string> cond = !foreach(frag, ldst,
+    "if (m == " # frag.m # " && k == " #frag.k # " && \"" #
+    frag.ptx_elt_type # "\" == eltype)"
+    "  return "# frag.n #";");
+  string id = !foldl("", cond, acc, el, acc # "\n" # el);
+}
 
-  let description = !strconcat(baseDescription, opDescription);
+/// Helper to map a kxn shape to a supported mxnxk matrix type. This will return
+/// the m value of the supported configuration.
+class MMA_ST_INFER_M<list<WMMA_REGS> ldst> {
+  list<string> cond = !foreach(frag, ldst,
+    "if (n == " # frag.n # " && k == " #frag.k # " && \"" #
+    frag.ptx_elt_type # "\" == eltype)"
+    "  return "# frag.m #";");
+  string id = !foldl("", cond, acc, el, acc # "\n" # el);
+}
 
-  let verifier = [{ return ::verify(*this); }];
+/// Helper to map a mxn shape to a supported mxnxk matrix type. This will return
+/// the k value of the supported configuration.
+class MMA_ST_INFER_K<list<WMMA_REGS> ldst> {
+  list<string> cond = !foreach(frag, ldst,
+    "if (m == " # frag.m # " && n == " #frag.n # " && \"" #
+    frag.ptx_elt_type # "\" == eltype)"
+    "  return "# frag.k #";");
+  string id = !foldl("", cond, acc, el, acc # "\n" # el);
 }
 
-def NVVM_WMMALoadBM16N16K16Op :
-  NVVM_WMMALoadOp<"wmma.m16n16k16.load.b.f16.row.stride">{
+/// Helper to create the mapping between the configuration and the load
+/// intrinsic enum value.
+class MMA_LD_INTR<string op> {
+  list<list<string>> cond0 = !foreach(frag, NVVM_MMA_OPS.all_ld_ops,
+                               !foreach(layout, ["row", "col"],
+    "if (layout == \"" # layout # "\" && m == " # frag.m # " &&"
+    " n == " #frag.n # " && k == " # frag.k # " && \"" #
+    frag.ptx_elt_type # "\" == eltype && frag == \""#frag.frag#"\")"
+    "  return "# WMMA_NAME_LDST<op, frag, layout, 1>.id #";"));
+  string id = !foldl("",
+                !foldl([""], cond0, acc, el, !listconcat(acc, el)),
+                acc1, el1, acc1 # "\n" # el1);
+}
 
-  string llvmBuilder = [{
-    $res = createNvvmIntrinsicCall(
-      builder, llvm::Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride, $args);
-  }];
+/// Helper to create the mapping between the configuration and the wmma.mma
+/// intrinsic enum value.
+class MMA_MMA_INTR<string opName> {
+  list<list<list<string>>> cond0 =
+    !foreach(op, NVVM_MMA_OPS.all_wmma_ops,
+      !foreach(layoutA, ["row", "col"],
+        !foreach(layoutB, ["row", "col"],
+          "if (layoutA == \"" # layoutA # "\" && layoutB == \"" # layoutB # "\" && "
+          " m == " # op[0].m # " && n == " #op[0].n # " && k == " # op[0].k #
+          " && \"" # op[0].ptx_elt_type # "\" == eltypeA && \""
+          # op[3].ptx_elt_type # "\" == eltypeB)"
+          "  return " #
+          WMMA_NAME<layoutA, layoutB, op[0], op[1], op[2], op[3]>.id # ";")));
+  list<string> f = !foldl([""],
+                     !foldl([[""]], cond0, acc, el, !listconcat(acc, el)),
+                     acc1, el1, !listconcat(acc1, el1));
+  string id = !foldl("", f, acc, el, acc # "\n" # el);
+}
 
-  string opDescription = [{
-    Example:
+def MMALayoutRow : StrEnumAttrCase<"row">;
+def MMALayoutCol : StrEnumAttrCase<"col">;
 
-    ```mlir
-    %2 = nvvm.wmma.m16n16k16.load.b %0, %1 : !llvm.ptr<i32>, !llvm.i32 ->
-      !llvm.struct<(vec<2 x half>, vec<2 x half>, vec<2 x half>, vec<2 x half>,
-      vec<2 x half>, vec<2 x half>, vec<2 x half>, vec<2 x half>)>
-    ```
-  }];
+/// Enum attribute of the different matrix layout.
+def MMALayout : StrEnumAttr<"MMALayout", "NVVM MMA layout",
+    [MMALayoutRow, MMALayoutCol]> {
+  let cppNamespace = "::mlir::NVVM";
+  let storageType = "mlir::StringAttr";
+  let returnType = "NVVM::MMALayout";
+  let convertFromStorage = "*symbolizeEnum<NVVM::MMALayout>($_self.getValue())";
+  let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))";
+}
 
-  let description = !strconcat(baseDescription, opDescription);
+def MMATypeF16 : StrEnumAttrCase<"f16">;
+def MMATypeF32 : StrEnumAttrCase<"f32">;
+def MMATypeTF32 : StrEnumAttrCase<"tf32">;
 
-  let verifier = [{ return ::verify(*this); }];
+/// Enum attribute of the different matrix types.
+def MMATypes : StrEnumAttr<"MMATypes", "NVVM MMA types",
+    [MMATypeF16, MMATypeF32, MMATypeTF32]> {
+  let cppNamespace = "::mlir::NVVM";
+  let storageType = "mlir::StringAttr";
+  let returnType = "NVVM::MMATypes";
+  let convertFromStorage = "*symbolizeEnum<NVVM::MMATypes>($_self.getValue())";
+  let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))";
 }
 
-def NVVM_WMMALoadCF16M16N16K16Op :
-  NVVM_WMMALoadOp<"wmma.m16n16k16.load.c.f16.row.stride">{
-  string llvmBuilder = [{
-    $res = createNvvmIntrinsicCall(
-      builder, llvm::Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride, $args);
-  }];
+def MMAFragA : StrEnumAttrCase<"a">;
+def MMAFragB : StrEnumAttrCase<"b">;
+def MMAFragC : StrEnumAttrCase<"c">;
 
-  string opDescription = [{
-    Example:
+/// Enum attribute of the different frag types.
+def MMAFragAttr : StrEnumAttr<"MMAFrag", "NVVM MMA frag type",
+    [MMAFragA, MMAFragB, MMAFragC]> {
+  let cppNamespace = "::mlir::NVVM";
+  let storageType = "mlir::StringAttr";
+  let returnType = "NVVM::MMAFrag";
+  let convertFromStorage = "*symbolizeEnum<NVVM::MMAFrag>($_self.getValue())";
+  let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))";
+}
 
-    ```mlir
-    %2 = nvvm.wmma.m16n16k16.load.c.f16.row.stride %0, %1 : !llvm.ptr<i32>, !llvm.i32 ->
-      !llvm.struct<(vec<2 x half>, vec<2 x half>, vec<2 x half>, vec<2 x half>)>
-    ```
-  }];
+def NVVM_WMMALoadOp: NVVM_Op<"wmma.load">,
+  Results<(outs LLVM_AnyStruct:$res)>,
+  Arguments<(ins LLVM_AnyPointer: $ptr, I32: $stride, I32Attr:$m,
+             I32Attr:$n, I32Attr:$k, MMALayout:$layout, MMATypes:$eltype,
+             MMAFragAttr:$frag)> {
 
-  let description = !strconcat(baseDescription, opDescription);
+  let summary = "Warp synchronous matrix load";
+
+  // Since LLVM intrinsic IDs are enums that cannot be dynamically generated in
+  // C++, we instantiate a function in tablegen to map the valid configurations
+  // to the corresponding intrinsic IDs.
+  // Because we want a single source of truth, this means the source of truth
+  // about valid combinations needs to be in tablegen; therefore we generate
+  // extra helpers to query valid configurations based on the shapes of
+  // load/store operations.
+  let extraClassDeclaration =
+    "static llvm::Intrinsic::ID getIntrinsicID("
+    "int m, int n, int k, mlir::NVVM::MMALayout layoutEnum,"
+    "mlir::NVVM::MMATypes eltypeEnum, mlir::NVVM::MMAFrag fragEnum) {"
+    "llvm::StringRef layout = stringifyEnum(layoutEnum);"
+    "llvm::StringRef eltype = stringifyEnum(eltypeEnum);"
+    "llvm::StringRef frag = stringifyEnum(fragEnum);"
+    #MMA_LD_INTR<"load">.id# "\n"
+    "return 0;"
+    "}\n"
+    "/// Helpers to find valid n dimension based on mxk load shape.\n"
+    "static int inferNDimension(int m, int k, mlir::NVVM::MMATypes eltypeEnum) {"
+    "  llvm::StringRef eltype = stringifyEnum(eltypeEnum);"
+    #MMA_ST_INFER_N<!filter(op, NVVM_MMA_OPS.all_ld_ops, !eq(op.frag, "a"))>.id# "\n"
+    "return 0;"
+    "}\n"
+    "/// Helpers to find valid m dimension based on kxn load shape.\n"
+    "static int inferMDimension(int k, int n, mlir::NVVM::MMATypes eltypeEnum) {"
+    "  llvm::StringRef eltype = stringifyEnum(eltypeEnum);"
+    #MMA_ST_INFER_M<!filter(op, NVVM_MMA_OPS.all_ld_ops, !eq(op.frag, "b"))>.id# "\n"
+    "return 0;"
+    "}\n"
+    "/// Helpers to find valid k dimension based on mxn load shape.\n"
+    "static int inferKDimension(int m, int n, mlir::NVVM::MMATypes eltypeEnum) {"
+    "  llvm::StringRef eltype = stringifyEnum(eltypeEnum);"
+    #MMA_ST_INFER_K<!filter(op, NVVM_MMA_OPS.all_ld_ops, !eq(op.frag, "c"))>.id# "\n"
+    "return 0;"
+    "}\n";
 
-  let verifier = [{ return ::verify(*this); }];
-}
-def NVVM_WMMALoadCF32M16N16K16Op :
-  NVVM_WMMALoadOp<"wmma.m16n16k16.load.c.f32.row.stride">{
   string llvmBuilder = [{
-    $res = createNvvmIntrinsicCall(
-      builder, llvm::Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride, $args);
+    auto operands = moduleTranslation.lookupValues(opInst.getOperands());
+    auto intId = mlir::NVVM::WMMALoadOp::getIntrinsicID(
+        $m, $n, $k, $layout, $eltype, $frag);
+    $res = createIntrinsicCall(builder, intId, operands, {operands[0]->getType()});
   }];
 
-  string opDescription = [{
+  string baseDescription = [{
+    The `nvvm.wmma.load` operation loads a matrix collectively using all the
+    threads in a warp.
+
+    The operation takes two arguments, the address from where the matrix
+    elements are to be loaded from and a stride. The stride argument
+    represents the leading dimension of the source matrix. The address and
+    the stride are required to be the same across all threads in the warp.
+    Each thread in a warp holds a certain number of elements. The op returns
+    an LLVM struct which holds the elements of the matrix held by this thread.
+
+    This op is meant to be used along with `nvvm.wmma.store` and
+    `nvvm.wmma.mma`.
+
     Example:
 
     ```mlir
-    %2 = nvvm.wmma.m16n16k16.load.c.f32.row.stride %0, %1 : !llvm.ptr<i32>, !llvm.i32 ->
-      !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
+    %2 = nvvm.wmma.load %0, %1
+      {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+      : (!llvm.ptr<i32>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
     ```
-  }];
-
-  let description = !strconcat(baseDescription, opDescription);
+  }];
+  let assemblyFormat = "$ptr `,` $stride attr-dict `:` functional-type($ptr, $res)";
   let verifier = [{ return ::verify(*this); }];
 }
 
-// Base class for all the variants of WMMA storeOps that may be defined.
-class NVVM_WMMAStoreOp : NVVM_Op, - Arguments<(ins Variadic:$args)>{ +def NVVM_WMMAStoreOp : NVVM_Op<"wmma.store">, + Arguments<(ins LLVM_AnyPointer: $ptr, + I32Attr:$m, I32Attr:$n, I32Attr:$k, MMALayout:$layout, + MMATypes:$eltype, Variadic:$args, I32: $stride)>{ let summary = "Warp synchronous matrix store"; + let extraClassDeclaration = + "static llvm::Intrinsic::ID getIntrinsicID(" + "int m, int n, int k, mlir::NVVM::MMALayout layoutEnum," + "mlir::NVVM::MMATypes eltypeEnum) {" + " llvm::StringRef layout = stringifyEnum(layoutEnum);" + " llvm::StringRef eltype = stringifyEnum(eltypeEnum);" + #MMA_ST_INTR<"store">.id# "\n" + "return 0;" + "}\n" + "/// Helpers to find valid k dimension based on mxn store shape.\n" + "static int inferKDimension(int m, int n, mlir::NVVM::MMATypes eltypeEnum) {" + " llvm::StringRef eltype = stringifyEnum(eltypeEnum);" + #MMA_ST_INFER_K.id# "\n" + "return 0;" + "}"; + + string llvmBuilder = [{ + auto operands = moduleTranslation.lookupValues(opInst.getOperands()); + auto intId = + mlir::NVVM::WMMAStoreOp::getIntrinsicID($m, $n, $k, $layout, $eltype); + createIntrinsicCall(builder, intId, operands, {operands[0]->getType()}); + }]; + string baseDescription = [{ - The `nvvm.wmma.m*n*k*.store` operation stores a matrix collectively using + The `nvvm.wmma.store` operation stores a matrix collectively using all the threads in a warp. The operation takes as arguments the address to where the matrix elements are @@ -279,60 +510,50 @@ class NVVM_WMMAStoreOp : NVVM_Op, This op is meant to be used along with `nvvm.wmma.m16n16k16.load` and `nvvm.wmma.m16n16k16.mma`. - }]; - - let assemblyFormat = "$args attr-dict `:` type($args)"; -} - -def NVVM_WMMAStoreF16M16N16K16Op : NVVM_WMMAStoreOp<"wmma.m16n16k16.store.d.f16.row.stride"> { - string llvmBuilder = [{ - createNvvmIntrinsicCall( - builder, llvm::Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride, $args); - }]; - - string opDescription = [{ - Example: - - ```mlir - nvvm.wmma.m16n16k16.stored.f16.row.stride %0, %1, %2, %3, %4, %5, %6 : !llvm.ptr, - !llvm.struct<(vec<2 x half>, vec<2 x half>, vec<2 x half>, vec<2 x half>)>, !llvm.i32 - ``` - }]; - - let description = !strconcat(baseDescription, opDescription); - - let verifier = [{ return ::verify(*this); }]; -} -def NVVM_WMMAStoreF32M16N16K16Op : NVVM_WMMAStoreOp<"wmma.m16n16k16.store.d.f32.row.stride"> { - string llvmBuilder = [{ - createNvvmIntrinsicCall( - builder, llvm::Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride, $args); - }]; - - string opDescription = [{ Example: ```mlir - nvvm.wmma.m16n16k16.store.d.f32.row.stride %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, - %10 : !llvm.ptr, !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, - !llvm.i32 + nvvm.wmma.store %0, %1, %2, %3, %4, %5 + {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} + : !llvm.ptr, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16> ``` }]; - let description = !strconcat(baseDescription, opDescription); - + let assemblyFormat = "$ptr `,` $stride `,` $args attr-dict `:` type($ptr) `,` type($args)"; let verifier = [{ return ::verify(*this); }]; } // Base class for all the variants of WMMA mmaOps that may be defined. 
-class NVVM_WMMAMmaOp : NVVM_Op, +def NVVM_WMMAMmaOp : NVVM_Op<"wmma.mma">, Results<(outs LLVM_AnyStruct:$res)>, - Arguments<(ins Variadic:$args)>{ + Arguments<(ins I32Attr:$m, I32Attr:$n, I32Attr:$k, MMALayout:$layoutA, + MMALayout:$layoutB, MMATypes:$eltypeA, MMATypes:$eltypeB, + Variadic:$args)>{ let summary = "Warp synchronous matrix-multiply accumulate using tensor cores."; + let extraClassDeclaration = + "static llvm::Intrinsic::ID getIntrinsicID(" + "int m, int n, int k, mlir::NVVM::MMALayout layoutAEnum," + "mlir::NVVM::MMALayout layoutBEnum, mlir::NVVM::MMATypes eltypeAEnum," + "mlir::NVVM::MMATypes eltypeBEnum) {" + "llvm::StringRef layoutA = stringifyEnum(layoutAEnum);" + "llvm::StringRef layoutB = stringifyEnum(layoutBEnum);" + "llvm::StringRef eltypeA = stringifyEnum(eltypeAEnum);" + "llvm::StringRef eltypeB = stringifyEnum(eltypeBEnum);" + #MMA_MMA_INTR<"mma">.id# "\n" + "return 0;" + "}"; + + string llvmBuilder = [{ + auto operands = moduleTranslation.lookupValues(opInst.getOperands()); + auto intId = mlir::NVVM::WMMAMmaOp::getIntrinsicID( + $m, $n, $k, $layoutA, $layoutB, $eltypeA, $eltypeB); + $res = createIntrinsicCall(builder, intId, operands); + }]; + string baseDescription = [{ - The `nvvm.wmma.m*n*k*.mma` operation performs a matrix-multiply accumulate + The `nvvm.wmma.mma` operation performs a matrix-multiply accumulate (mma) operation using all the threads in a warp. The operation performed is represented as `D = A * B + C`. The operation takes @@ -340,64 +561,20 @@ class NVVM_WMMAMmaOp : NVVM_Op, current thread. The op returns a LLVM struct which holds a part of the result held by the current thread. - This op is meant to be used along with `nvvm.wmma.m16n16k16.load` and `nvvm.wmma. - m16n16k16.store`. - }]; -} - -def NVVM_WMMAMmaF16F16M16N16K16Op : NVVM_WMMAMmaOp<"wmma.m16n16k16.mma.row.row.f16.f16">{ - string llvmBuilder = [{ - $res = createNvvmIntrinsicCall( - builder, llvm::Intrinsic::nvvm_wmma_m16n16k16_mma_row_row_f16_f16, $args); - }]; - - string opDescription = [{ - Example: - - ```mlir - %20 = nvvm.wmma.m16n16k16.mma.row.row.f16.f16 %0, %1, %2, %3, %4, %5, %6, %7, %8, - %9, %10, %11, %12, %13, %14, %15, %16, %17, %18, %19 : vector<2xf16> -> !llvm.struct - <(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> - ``` - }]; - - let parser = [{ - return parseWMMAMmaF16F16M16N16K16Op(parser, result); - }]; - - let printer = [{ - printWMMAMmaF16F16M16N16K16Op(p, *this); - }]; + This op is meant to be used along with `nvvm.wmma.load` and + `nvvm.wmma.store`. 
- let description = !strconcat(baseDescription, opDescription); - - let verifier = [{ return ::verify(*this); }]; -} - -def NVVM_WMMAMmaF32F32M16N16K16Op : NVVM_WMMAMmaOp<"wmma.m16n16k16.mma.row.row.f32.f32">{ - string llvmBuilder = [{ - $res = createNvvmIntrinsicCall( - builder, llvm::Intrinsic::nvvm_wmma_m16n16k16_mma_row_row_f32_f32, $args); - }]; - - string opDescription = [{ Example: ```mlir - %24 = nvvm.wmma.m16n16k16.mma.row.row.f32.f32 %0, %1, %2, %3, %4, %5, %6, %7, %8 - %9, %10, %11, %12, %13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23 : - (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, - vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, - vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, - vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, - f32, f32, f32, f32, f32, f32, f32)> + %16 = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15 + {eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} + : (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32) + -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> ``` }]; let assemblyFormat = "$args attr-dict `:` functional-type($args, $res)"; - - let description = !strconcat(baseDescription, opDescription); - let verifier = [{ return ::verify(*this); }]; } diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h index c95488d14a49b..cdba5a8094931 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h @@ -359,14 +359,6 @@ llvm::Value *createIntrinsicCall(llvm::IRBuilderBase &builder, llvm::Intrinsic::ID intrinsic, ArrayRef args = {}, ArrayRef tys = {}); - -/// Creates a call to an LLVM IR intrinsic function with the given arguments -/// for NVVM WMMA ops. Handles cases where the intrinsic name is overloaded -/// using the types of arguments supplied. Selects the correct intrinsic -/// by inspecting the argument types. -llvm::Value *createNvvmIntrinsicCall(llvm::IRBuilderBase &builder, - llvm::Intrinsic::ID intrinsic, - ArrayRef args = {}); } // namespace detail } // namespace LLVM diff --git a/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp b/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp index 681725441539f..49d48bf2d630b 100644 --- a/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp @@ -36,26 +36,36 @@ static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands, return success(); } -/// Error string to emit when unimplemented WMMA variant is encountered. -static constexpr StringRef kInvalidCaseStr = - "Unimplemented WMMA variant, Only M16N16K16 version implemented."; +/// Error string to emit when an unimplemented WMMA variant is encountered. +static constexpr StringRef kInvalidCaseStr = "Unsupported WMMA variant."; + +static NVVM::MMAFrag convertOperand(StringRef operandName) { + if (operandName.equals("AOp")) + return NVVM::MMAFrag::a; + if (operandName.equals("BOp")) + return NVVM::MMAFrag::b; + if (operandName.equals("COp")) + return NVVM::MMAFrag::c; + llvm_unreachable("Unknown operand name"); +} + +static NVVM::MMATypes getElementType(gpu::MMAMatrixType type) { + if (type.getElementType().isF16()) + return NVVM::MMATypes::f16; + if (type.getElementType().isF32()) + return type.getOperand().equals("COp") ? 
NVVM::MMATypes::f32
+                                           : NVVM::MMATypes::tf32;
+  llvm_unreachable("Unsupported type");
+}
 
 /// Return the LLVMStructureType corresponding to the MMAMatrixType `type`.
 static LLVM::LLVMStructType convertMMAToLLVMType(gpu::MMAMatrixType type) {
-  StringRef operandStr = type.getOperand();
-  assert(type.getElementType().isa<FloatType>());
-  Type baseType = type.getElementType().isF16()
-                      ? VectorType::get(2, type.getElementType())
-                      : type.getElementType();
-  auto getLLVMType = [&](int64_t numElements) {
-    return LLVM::LLVMStructType::getLiteral(
-        type.getContext(), SmallVector<Type>(numElements, baseType));
-  };
-  if (operandStr.equals("AOp") || operandStr.equals("BOp"))
-    return getLLVMType(8);
-  if (type.getElementType().isF16())
-    return getLLVMType(4);
-  return getLLVMType(8);
+  NVVM::MMAFrag frag = convertOperand(type.getOperand());
+  NVVM::MMATypes eltType = getElementType(type);
+  std::pair<Type, unsigned> typeInfo =
+      inferMMAType(eltType, frag, type.getContext());
+  return LLVM::LLVMStructType::getLiteral(
+      type.getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first));
 }
 
 /// This class implements the conversion of GPU MMA loadOp to wmma.load op
@@ -118,41 +128,41 @@ struct WmmaLoadOpToNVVMLowering
     gpu::MMAMatrixType retType =
         subgroupMmaLoadMatrixOp.res().getType().cast<gpu::MMAMatrixType>();
     ArrayRef<int64_t> retTypeShape = retType.getShape();
+    int64_t m = 0;
+    int64_t n = 0;
+    int64_t k = 0;
+    NVVM::MMATypes eltype = getElementType(retType);
+    // NVVM intrinsics require mxnxk dimensions to be given; infer the missing
+    // dimension based on the valid intrinsics available.
+    if (retType.getOperand().equals("AOp")) {
+      m = retTypeShape[0];
+      k = retTypeShape[1];
+      n = NVVM::WMMALoadOp::inferNDimension(m, k, eltype);
+    } else if (retType.getOperand().equals("BOp")) {
+      k = retTypeShape[0];
+      n = retTypeShape[1];
+      m = NVVM::WMMALoadOp::inferMDimension(k, n, eltype);
+    } else if (retType.getOperand().equals("COp")) {
+      m = retTypeShape[0];
+      n = retTypeShape[1];
+      k = NVVM::WMMALoadOp::inferKDimension(m, n, eltype);
+    }
+    NVVM::MMALayout layout = NVVM::MMALayout::row;
+    NVVM::MMAFrag frag = convertOperand(retType.getOperand());
+    // Check that there is an existing instruction for the combination we need.
+    if (NVVM::WMMALoadOp::getIntrinsicID(m, n, k, layout, eltype, frag) == 0)
+      return rewriter.notifyMatchFailure(op, kInvalidCaseStr);
     Type resType = convertMMAToLLVMType(retType);
-    StringRef operandStr = retType.getOperand();
 
     // Create nvvm.mma_load op according to the operand types.
Value leadingDim32 = rewriter.create( loc, rewriter.getI32Type(), leadDimension); - SmallVector loadOpOperands({loadAddressCasted, leadingDim32}); - if (operandStr.equals("AOp")) { - if (retTypeShape[0] == 16 && retTypeShape[1] == 16) { - rewriter.replaceOpWithNewOp(op, resType, - loadOpOperands); - } else { - return rewriter.notifyMatchFailure(op, kInvalidCaseStr); - } - } else if (operandStr.equals("BOp")) { - if (retTypeShape[0] == 16 && retTypeShape[1] == 16) { - rewriter.replaceOpWithNewOp(op, resType, - loadOpOperands); - } else { - return rewriter.notifyMatchFailure(op, kInvalidCaseStr); - } - } else { - if (retTypeShape[0] == 16 && retTypeShape[1] == 16) { - if (retType.getElementType().isF16()) { - rewriter.replaceOpWithNewOp( - op, resType, loadOpOperands); - } else if (retType.getElementType().isF32()) { - rewriter.replaceOpWithNewOp( - op, resType, loadOpOperands); - } - } else { - return rewriter.notifyMatchFailure(op, kInvalidCaseStr); - } - } + + rewriter.replaceOpWithNewOp( + op, resType, loadAddressCasted, leadingDim32, m, n, k, layout, eltype, + frag); + return success(); } }; @@ -212,13 +222,18 @@ struct WmmaStoreOpToNVVMLowering storeAddress); SmallVector storeOpOperands; - storeOpOperands.push_back(storeAddressCasted); - // Get the shape of the MMAMatrix type being stored. The shape will // choose which intrinsic this op will be lowered to. gpu::MMAMatrixType srcType = subgroupMmaStoreMatrixOp.src().getType().cast(); ArrayRef srcTypeShape = srcType.getShape(); + NVVM::MMALayout layout = NVVM::MMALayout::row; + NVVM::MMATypes eltype = getElementType(srcType); + int64_t m = srcTypeShape[0]; + int64_t n = srcTypeShape[1]; + int64_t k = NVVM::WMMAStoreOp::inferKDimension(m, n, eltype); + if (NVVM::WMMAStoreOp::getIntrinsicID(m, n, k, layout, eltype) == 0) + return rewriter.notifyMatchFailure(op, kInvalidCaseStr); auto matrixType = adaptor.src().getType().cast(); for (unsigned i = 0, e = matrixType.getBody().size(); i < e; ++i) { @@ -229,29 +244,11 @@ struct WmmaStoreOpToNVVMLowering } Value leadingDim32 = rewriter.create( loc, rewriter.getI32Type(), leadDimension); - storeOpOperands.push_back(leadingDim32); - // Unpack the results from the source. - if (srcType.getElementType().isF16()) { - // Create nvvm.mma_store op. - if (srcTypeShape[0] == 16 && srcTypeShape[1] == 16) { - rewriter.create(loc, storeOpOperands); - } else { - return rewriter.notifyMatchFailure(op, kInvalidCaseStr); - } - rewriter.eraseOp(op); - return success(); - } - if (srcType.getElementType().isF32()) { - // Create nvvm.mma_store op. 
- if (srcTypeShape[0] == 16 && srcTypeShape[1] == 16) - rewriter.create(loc, storeOpOperands); - else { - return rewriter.notifyMatchFailure(op, kInvalidCaseStr); - } - rewriter.eraseOp(op); - return success(); - } - return failure(); + rewriter.create(loc, storeAddressCasted, m, n, k, layout, + eltype, storeOpOperands, leadingDim32); + + rewriter.eraseOp(op); + return success(); } }; @@ -292,40 +289,27 @@ struct WmmaMmaOpToNVVMLowering gpu::MMAMatrixType aType = subgroupMmaComputeOp.opA().getType().cast(); ArrayRef aTypeShape = aType.getShape(); - gpu::MMAMatrixType bType = - subgroupMmaComputeOp.opB().getType().cast(); - ArrayRef bTypeShape = bType.getShape(); gpu::MMAMatrixType cType = subgroupMmaComputeOp.opC().getType().cast(); ArrayRef cTypeShape = cType.getShape(); + int64_t m = cTypeShape[0]; + int64_t n = cTypeShape[1]; + int64_t k = aTypeShape[1]; + NVVM::MMALayout layout = NVVM::MMALayout::row; + NVVM::MMATypes sourceType = getElementType(aType); + NVVM::MMATypes destType = getElementType(cType); + if (NVVM::WMMAMmaOp::getIntrinsicID(m, n, k, layout, layout, sourceType, + destType) == 0) + return rewriter.notifyMatchFailure(op, kInvalidCaseStr); unpackOp(adaptor.opA()); unpackOp(adaptor.opB()); unpackOp(adaptor.opC()); - if (cType.getElementType().isF16()) { - if (aTypeShape[0] == 16 && aTypeShape[1] == 16 && bTypeShape[0] == 16 && - bTypeShape[1] == 16 && cTypeShape[0] == 16 && cTypeShape[1] == 16) { - // Create nvvm.wmma.mma op. - rewriter.replaceOpWithNewOp( - op, adaptor.opC().getType(), unpackedOps); - - return success(); - } - return rewriter.notifyMatchFailure(op, kInvalidCaseStr); - } - if (cType.getElementType().isF32()) { - if (aTypeShape[0] == 16 && aTypeShape[1] == 16 && bTypeShape[0] == 16 && - bTypeShape[1] == 16 && cTypeShape[0] == 16 && cTypeShape[1] == 16) { - // Create nvvm.wmma.mma op. 
- rewriter.replaceOpWithNewOp( - op, adaptor.opC().getType(), unpackedOps); - - return success(); - } - return rewriter.notifyMatchFailure(op, kInvalidCaseStr); - } - return failure(); + rewriter.replaceOpWithNewOp( + op, adaptor.opC().getType(), m, n, k, layout, layout, sourceType, + destType, unpackedOps); + return success(); } }; diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 6cf166f41cd18..ff82f9e1eaa66 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -31,6 +31,7 @@ using namespace mlir; using namespace NVVM; #include "mlir/Dialect/LLVMIR/NVVMOpsDialect.cpp.inc" +#include "mlir/Dialect/LLVMIR/NVVMOpsEnums.cpp.inc" //===----------------------------------------------------------------------===// // Printing/parsing for NVVM ops @@ -132,201 +133,100 @@ static LogicalResult verify(MmaOp op) { return op.emitOpError("unimplemented mma.sync variant"); } -template -static LogicalResult verifyWMMALoadOp(T op, StringRef operand) { - MLIRContext *context = op.getContext(); - auto i32Ty = IntegerType::get(context, 32); - auto i32Ptr1Ty = LLVM::LLVMPointerType::get(i32Ty, 1); - auto i32Ptr3Ty = LLVM::LLVMPointerType::get(i32Ty, 3); - auto i32Ptr0Ty = LLVM::LLVMPointerType::get(i32Ty, 0); - auto f16Ty = FloatType::getF16(context); - auto f32Ty = FloatType::getF32(context); - auto f16x2Ty = VectorType::get(2, f16Ty); - auto f16x2x4StructTy = LLVM::LLVMStructType::getLiteral( - context, {f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty}); - auto f16x2x8StructTy = LLVM::LLVMStructType::getLiteral( - context, - {f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty}); - auto f32x8StructTy = LLVM::LLVMStructType::getLiteral( - context, {f32Ty, f32Ty, f32Ty, f32Ty, f32Ty, f32Ty, f32Ty, f32Ty}); - - SmallVector operandTypes(op.getOperandTypes().begin(), - op.getOperandTypes().end()); - if (operandTypes != SmallVector{i32Ptr1Ty, i32Ty} && - operandTypes != SmallVector{i32Ptr3Ty, i32Ty} && - operandTypes != SmallVector{i32Ptr0Ty, i32Ty}) { - return op.emitOpError("expected operands to be a source pointer in memory " - "space 0, 1, 3 followed by ldm of the source"); +std::pair +inferMMAType(NVVM::MMATypes type, NVVM::MMAFrag frag, MLIRContext *context) { + unsigned numberElements = 0; + Type elementType; + OpBuilder builder(context); + Type f16x2 = VectorType::get(2, builder.getF16Type()); + if (type == NVVM::MMATypes::f16) { + elementType = f16x2; + if (frag == NVVM::MMAFrag::a || frag == NVVM::MMAFrag::b) + numberElements = 8; + else + numberElements = 4; + } else if (type == NVVM::MMATypes::f32) { + elementType = builder.getF32Type(); + numberElements = 8; + } else if (type == NVVM::MMATypes::tf32) { + elementType = builder.getI32Type(); + numberElements = 4; } - - if (operand.equals("AOp") || operand.equals("BOp")) { - if (op.getType() != f16x2x8StructTy) { - return op.emitOpError("expected result type of loadAOp and loadBOp to be " - "a struct of 8 s"); - } - } else if (operand.equals("COp")) { - if (op.getType() != f16x2x4StructTy && op.getType() != f32x8StructTy) { - return op.emitOpError("expected result type of loadCOp to be a struct of " - "4 s or 8 f32s"); - } - } - - return success(); -} - -static LogicalResult verify(WMMALoadAM16N16K16Op op) { - return verifyWMMALoadOp(op, "AOp"); -} - -static LogicalResult verify(WMMALoadBM16N16K16Op op) { - return verifyWMMALoadOp(op, "BOp"); -} - -static LogicalResult verify(WMMALoadCF16M16N16K16Op op) { - return verifyWMMALoadOp(op, 
"COp"); -} - -static LogicalResult verify(WMMALoadCF32M16N16K16Op op) { - return verifyWMMALoadOp(op, "COp"); -} - -template -static bool verifyWMMAStoreOp(T op, SmallVector &containedElems) { - SmallVector operandTypes(op.getOperandTypes().begin(), - op.getOperandTypes().end()); - if (operandTypes == containedElems) - return true; - - return false; -} - -static LogicalResult verify(WMMAStoreF16M16N16K16Op op) { - MLIRContext *context = op.getContext(); - auto i32Ty = IntegerType::get(context, 32); - auto i32Ptr1Ty = LLVM::LLVMPointerType::get(i32Ty, 1); - auto i32Ptr3Ty = LLVM::LLVMPointerType::get(i32Ty, 3); - auto i32Ptr0Ty = LLVM::LLVMPointerType::get(i32Ty, 0); - auto f16Ty = FloatType::getF16(context); - auto f16x2Ty = VectorType::get(2, f16Ty); - SmallVector type1{i32Ptr1Ty, f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty, i32Ty}; - SmallVector type0{i32Ptr0Ty, f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty, i32Ty}; - SmallVector type3{i32Ptr3Ty, f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty, i32Ty}; - if (verifyWMMAStoreOp(op, type1) || verifyWMMAStoreOp(op, type0) || - verifyWMMAStoreOp(op, type3)) - return success(); - - return op.emitOpError("expected operands to be a source pointer in memory" - "space 0, 1, 3 followed by ldm of the source"); -} - -static LogicalResult verify(WMMAStoreF32M16N16K16Op op) { - MLIRContext *context = op.getContext(); - auto i32Ty = IntegerType::get(context, 32); - auto i32Ptr1Ty = LLVM::LLVMPointerType::get(i32Ty, 1); - auto i32Ptr3Ty = LLVM::LLVMPointerType::get(i32Ty, 3); - auto i32Ptr0Ty = LLVM::LLVMPointerType::get(i32Ty, 0); - auto f32Ty = FloatType::getF32(context); - - SmallVector type1{i32Ptr1Ty, f32Ty, f32Ty, f32Ty, f32Ty, - f32Ty, f32Ty, f32Ty, f32Ty, i32Ty}; - SmallVector type0{i32Ptr0Ty, f32Ty, f32Ty, f32Ty, f32Ty, - f32Ty, f32Ty, f32Ty, f32Ty, i32Ty}; - SmallVector type3{i32Ptr3Ty, f32Ty, f32Ty, f32Ty, f32Ty, - f32Ty, f32Ty, f32Ty, f32Ty, i32Ty}; - if (verifyWMMAStoreOp(op, type0) || verifyWMMAStoreOp(op, type1) || - verifyWMMAStoreOp(op, type3)) - return success(); - - return op.emitOpError("expected operands to be a source pointer in memory" - "space 0, 1, 3 followed by ldm of the source"); + assert(numberElements != 0 && elementType != nullptr); + return std::make_pair(elementType, numberElements); } -static LogicalResult verify(WMMAMmaF16F16M16N16K16Op op) { - MLIRContext *context = op.getContext(); - auto f16Ty = FloatType::getF16(context); - auto f16x2Ty = VectorType::get(2, f16Ty); - auto f16x2x4StructTy = LLVM::LLVMStructType::getLiteral( - context, {f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty}); - - SmallVector operandTypes(op.getOperandTypes().begin(), - op.getOperandTypes().end()); - if (operandTypes != SmallVector(20, f16x2Ty)) - return op.emitOpError("expected 20 s as operands"); - - if (op.getResult().getType() != f16x2x4StructTy) - return op.emitOpError("expected result type to be a struct of 4 s"); - +static LogicalResult verify(NVVM::WMMALoadOp op) { + unsigned addressSpace = + op.ptr().getType().cast().getAddressSpace(); + if (addressSpace != 0 && addressSpace != 1 && addressSpace != 3) + return op.emitOpError("expected source pointer in memory " + "space 0, 1, 3"); + + if (NVVM::WMMALoadOp::getIntrinsicID(op.m(), op.n(), op.k(), op.layout(), + op.eltype(), op.frag()) == 0) + return op.emitOpError() << "invalid attribute combination"; + std::pair typeInfo = + inferMMAType(op.eltype(), op.frag(), op.getContext()); + Type dstType = LLVM::LLVMStructType::getLiteral( + op.getContext(), SmallVector(typeInfo.second, typeInfo.first)); + if (op.getType() != dstType) + 
return op.emitOpError("expected destination type is a structure of ") + << typeInfo.second << " elements of type " << typeInfo.first; return success(); } -static LogicalResult parseWMMAMmaF16F16M16N16K16Op(OpAsmParser &parser, - OperationState &result) { - SmallVector operands; - ::llvm::SMLoc operandsLoc; - Type operandType; - Type resType; - - operandsLoc = parser.getCurrentLocation(); - if (parser.parseOperandList(operands) || - parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || - parser.parseType(operandType) || parser.parseArrow()) - return failure(); - - unsigned numOperands = operands.size(); - SmallVector operandTypes(numOperands, operandType); - if (parser.parseType(resType)) - return failure(); - result.addTypes(resType); - if (parser.resolveOperands(operands, operandTypes, operandsLoc, - result.operands)) - return failure(); +static LogicalResult verify(NVVM::WMMAStoreOp op) { + unsigned addressSpace = + op.ptr().getType().cast().getAddressSpace(); + if (addressSpace != 0 && addressSpace != 1 && addressSpace != 3) + return op.emitOpError("expected operands to be a source pointer in memory " + "space 0, 1, 3"); + + if (NVVM::WMMAStoreOp::getIntrinsicID(op.m(), op.n(), op.k(), op.layout(), + op.eltype()) == 0) + return op.emitOpError() << "invalid attribute combination"; + std::pair typeInfo = + inferMMAType(op.eltype(), NVVM::MMAFrag::c, op.getContext()); + if (op.args().size() != typeInfo.second) + return op.emitOpError() + << "expected " << typeInfo.second << " data operands"; + if (llvm::any_of(op.args(), [&typeInfo](Value operands) { + return operands.getType() != typeInfo.first; + })) + return op.emitOpError() + << "expected data operands of type " << typeInfo.first; return success(); } -static void printWMMAMmaF16F16M16N16K16Op(OpAsmPrinter &p, - WMMAMmaF16F16M16N16K16Op &op) { - p << ' '; - p << op.args(); - p.printOptionalAttrDict(op->getAttrs(), {}); - p << " : "; - p << op->getOperand(0).getType(); - p << ' ' << "->"; - p << ' '; - p << ::llvm::ArrayRef<::mlir::Type>(op.res().getType()); -} - -static LogicalResult verify(WMMAMmaF32F32M16N16K16Op op) { - unsigned numABOperands = 16; - unsigned numCOperands = 8; - MLIRContext *context = op.getContext(); - auto f16Ty = FloatType::getF16(context); - auto f32Ty = FloatType::getF32(context); - auto f16x2Ty = VectorType::get(2, f16Ty); - auto f32x8StructTy = LLVM::LLVMStructType::getLiteral( - context, {f32Ty, f32Ty, f32Ty, f32Ty, f32Ty, f32Ty, f32Ty, f32Ty}); - - SmallVector abOpTypes; - SmallVector bOpTypes; - SmallVector cOpTypes; - - for (auto operand : op->getOperands().take_front(numABOperands)) { - abOpTypes.push_back(operand.getType()); - } - - for (auto operand : - op->getOperands().drop_front(numABOperands).take_front(numCOperands)) { - cOpTypes.push_back(operand.getType()); +static LogicalResult verify(NVVM::WMMAMmaOp op) { + if (NVVM::WMMAMmaOp::getIntrinsicID(op.m(), op.n(), op.k(), op.layoutA(), + op.layoutB(), op.eltypeA(), + op.eltypeB()) == 0) + return op.emitOpError() << "invalid attribute combination"; + std::pair typeInfoA = + inferMMAType(op.eltypeA(), NVVM::MMAFrag::a, op.getContext()); + std::pair typeInfoB = + inferMMAType(op.eltypeA(), NVVM::MMAFrag::b, op.getContext()); + std::pair typeInfoC = + inferMMAType(op.eltypeB(), NVVM::MMAFrag::c, op.getContext()); + SmallVector arguments; + arguments.append(typeInfoA.second, typeInfoA.first); + arguments.append(typeInfoB.second, typeInfoB.first); + arguments.append(typeInfoC.second, typeInfoC.first); + unsigned numArgs = 
arguments.size(); + if (op.args().size() != numArgs) + return op.emitOpError() << "expected " << numArgs << " arguments"; + for (unsigned i = 0; i < numArgs; i++) { + if (op.args()[i].getType() != arguments[i]) + return op.emitOpError() + << "expected argument " << i << " to be of type " << arguments[i]; } - - if (abOpTypes != SmallVector(16, f16x2Ty)) - return op.emitOpError("expected 16 s for `a` and `b` operand"); - - if (cOpTypes != SmallVector(8, f32Ty)) - return op.emitOpError("expected 8 f32s for `c` operand"); - - if (op.getResult().getType() != f32x8StructTy) - return op.emitOpError("expected result type to be a struct of 8 f32s"); - + Type dstType = LLVM::LLVMStructType::getLiteral( + op.getContext(), SmallVector(typeInfoC.second, typeInfoC.first)); + if (op.getType() != dstType) + return op.emitOpError("expected destination type is a structure of ") + << typeInfoC.second << " elements of type " << typeInfoC.first; return success(); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index 7675387690a44..bdcb451323add 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -22,7 +22,6 @@ using namespace mlir; using namespace mlir::LLVM; using mlir::LLVM::detail::createIntrinsicCall; -using mlir::LLVM::detail::createNvvmIntrinsicCall; static llvm::Intrinsic::ID getShflBflyIntrinsicId(llvm::Type *resultType, bool withPredicate) { diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index e9c57bab97dea..39b83c2255654 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -443,29 +443,6 @@ llvm::Value *mlir::LLVM::detail::createIntrinsicCall( return builder.CreateCall(fn, args); } -llvm::Value * -mlir::LLVM::detail::createNvvmIntrinsicCall(llvm::IRBuilderBase &builder, - llvm::Intrinsic::ID intrinsic, - ArrayRef args) { - llvm::Module *module = builder.GetInsertBlock()->getModule(); - llvm::Function *fn; - if (llvm::Intrinsic::isOverloaded(intrinsic)) { - if (intrinsic != llvm::Intrinsic::nvvm_wmma_m16n16k16_mma_row_row_f16_f16 && - intrinsic != llvm::Intrinsic::nvvm_wmma_m16n16k16_mma_row_row_f32_f32) { - // NVVM load and store instrinsic names are overloaded on the - // source/destination pointer type. Pointer is the first argument in the - // corresponding NVVM Op. - fn = llvm::Intrinsic::getDeclaration(module, intrinsic, - {args[0]->getType()}); - } else { - fn = llvm::Intrinsic::getDeclaration(module, intrinsic, {}); - } - } else { - fn = llvm::Intrinsic::getDeclaration(module, intrinsic); - } - return builder.CreateCall(fn, args); -} - /// Given a single MLIR operation, create the corresponding LLVM IR operation /// using the `builder`. 
LogicalResult diff --git a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir index 9dd853a39b423..4c035acaf7383 100644 --- a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir @@ -22,7 +22,8 @@ gpu.module @test_module { // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJO]]] : (!llvm.ptr, i64) -> !llvm.ptr // CHECK: %[[CADDRESS:.*]] = llvm.bitcast %[[ADDRESS]] : !llvm.ptr to !llvm.ptr // CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 - // CHECK: %[[FRAG:.*]] = nvvm.wmma.m16n16k16.load.a.f16.row.stride %[[CADDRESS]], %[[LDM32]] : (!llvm.ptr, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + // CHECK: %[[FRAG:.*]] = nvvm.wmma.load %[[CADDRESS]], %[[LDM32]] + // CHECK-SAME: {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32 @@ -36,7 +37,8 @@ gpu.module @test_module { // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJO]]] : (!llvm.ptr, i32) -> !llvm.ptr // CHECK32: %[[CADDRESS:.*]] = llvm.bitcast %[[ADDRESS]] : !llvm.ptr to !llvm.ptr // CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 - // CHECK32: %[[FRAG:.*]] = nvvm.wmma.m16n16k16.load.a.f16.row.stride %[[CADDRESS]], %[[LDM32]] : (!llvm.ptr, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + // CHECK32: %[[FRAG:.*]] = nvvm.wmma.load %[[CADDRESS]], %[[LDM32]] + // CHECK32-SAME: {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> return %0 : !gpu.mma_matrix<16x16xf16, "AOp"> } @@ -70,7 +72,8 @@ gpu.module @test_module { // CHECK: %[[EL3:.*]] = llvm.extractvalue %[[D]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[EL4:.*]] = llvm.extractvalue %[[D]][3 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 - // CHECK: nvvm.wmma.m16n16k16.store.d.f16.row.stride %[[CADDRESS]], %[[EL1]], %[[EL2]], %[[EL3]], %[[EL4]], %[[LDM32]] : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, i32 + // CHECK: nvvm.wmma.store %[[CADDRESS]], %[[LDM32]], %[[EL1]], %[[EL2]], %[[EL3]], %[[EL4]] + // CHECK-SAME: {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> // CHECK: llvm.return // CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32 @@ -88,7 +91,8 @@ gpu.module @test_module { // CHECK32: %[[EL3:.*]] = llvm.extractvalue %[[D]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, 
vector<2xf16>, vector<2xf16>)>
     // CHECK32: %[[EL4:.*]] = llvm.extractvalue %[[D]][3 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
     // CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
-    // CHECK32: nvvm.wmma.m16n16k16.store.d.f16.row.stride %[[CADDRESS]], %[[EL1]], %[[EL2]], %[[EL3]], %[[EL4]], %[[LDM32]] : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, i32
+    // CHECK32: nvvm.wmma.store %[[CADDRESS]], %[[LDM32]], %[[EL1]], %[[EL2]], %[[EL3]], %[[EL4]]
+    // CHECK32-SAME: {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>
     // CHECK32: llvm.return
     return
   }
@@ -122,7 +126,9 @@ gpu.module @test_module {
     // CHECK: %[[C2:.*]] = llvm.extractvalue %[[C]][1 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
     // CHECK: %[[C3:.*]] = llvm.extractvalue %[[C]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
     // CHECK: %[[C4:.*]] = llvm.extractvalue %[[C]][3 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
-    // CHECK: %[[RES:.*]] = nvvm.wmma.m16n16k16.mma.row.row.f16.f16 %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]], %[[A6]], %[[A7]], %[[A8]], %[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B5]], %[[B6]], %[[B7]], %[[B8]], %[[C1]], %[[C2]], %[[C3]], %[[C4]] : vector<2xf16> -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+    // CHECK: %[[RES:.*]] = nvvm.wmma.mma %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]], %[[A6]], %[[A7]], %[[A8]], %[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B5]], %[[B6]], %[[B7]], %[[B8]], %[[C1]], %[[C2]], %[[C3]], %[[C4]]
+    // CHECK-SAME: {eltypeA = "f16", eltypeB = "f16", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} : (
+    // CHECK-SAME: vector<2xf16>, {{.*}}) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
     // CHECK: llvm.return %[[RES]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
     return %D : !gpu.mma_matrix<16x16xf16, "COp">
   }
@@ -133,13 +139,13 @@ gpu.module @test_module {
 // CHECK-LABEL: func @gpu_wmma_mma_loop_op
-// CHECK: %[[C:.+]] = nvvm.wmma.m16n16k16.load.c.f16.row.stride %{{.*}}, %{{.*}} : (!llvm.ptr, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[C:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = "f16", frag = "c", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 // CHECK: llvm.br ^bb1(%{{.*}}, %[[C]] : i64, !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>)
 // CHECK: ^bb1(%{{.*}}: i64, %[[ACC:.+]]: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>): // 2 preds: ^bb0, ^bb2
 // CHECK: llvm.cond_br %{{.*}}, ^bb2, ^bb3
 // CHECK: ^bb2: // pred: ^bb1
-// CHECK: %[[A:.+]] = nvvm.wmma.m16n16k16.load.a.f16.row.stride %{{.*}}, %{{.*}} : (!llvm.ptr, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
-// CHECK: %[[B:.+]] = nvvm.wmma.m16n16k16.load.b.f16.row.stride %{{.*}}, %{{.*}} : (!llvm.ptr, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[A:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[B:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = "f16", frag = "b", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 // CHECK: %[[A0:.+]] = llvm.extractvalue %[[A]][0 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 // CHECK: %[[A1:.+]] = llvm.extractvalue %[[A]][1 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 // CHECK: %[[A2:.+]] = llvm.extractvalue %[[A]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
@@ -160,14 +166,14 @@ gpu.module @test_module {
 // CHECK: %[[ACC1:.+]] = llvm.extractvalue %[[ACC]][1 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 // CHECK: %[[ACC2:.+]] = llvm.extractvalue %[[ACC]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 // CHECK: %[[ACC3:.+]] = llvm.extractvalue %[[ACC]][3 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
-// CHECK: %[[ACC_MUL:.+]] = nvvm.wmma.m16n16k16.mma.row.row.f16.f16 %[[A0]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]], %[[A6]], %[[A7]], %[[B0]], %[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B5]], %[[B6]], %[[B7]], %[[ACC0]], %[[ACC1]], %[[ACC2]], %[[ACC3]] : vector<2xf16> -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[ACC_MUL:.+]] = nvvm.wmma.mma %[[A0]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]], %[[A6]], %[[A7]], %[[B0]], %[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B5]], %[[B6]], %[[B7]], %[[ACC0]], %[[ACC1]], %[[ACC2]], %[[ACC3]] {eltypeA = "f16", eltypeB = "f16", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32} : (vector<2xf16>, {{.*}} -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 // CHECK: llvm.br ^bb1(%{{.*}}, %[[ACC_MUL]] : i64, !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>)
 // CHECK: ^bb3: // pred: ^bb1
 // CHECK: %[[E0:.+]] = llvm.extractvalue %[[ACC]][0 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 // CHECK: %[[E1:.+]] = llvm.extractvalue %[[ACC]][1 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 // CHECK: %[[E2:.+]] = llvm.extractvalue %[[ACC]][2 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 // CHECK: %[[E3:.+]] = llvm.extractvalue %[[ACC]][3 : i32] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
-// CHECK: nvvm.wmma.m16n16k16.store.d.f16.row.stride %{{.*}}, %[[E0]], %[[E1]], %[[E2]], %[[E3]], %{{.*}} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, i32
+// CHECK: nvvm.wmma.store %{{.*}}, %{{.*}}, %[[E0]], %[[E1]], %[[E2]], %[[E3]] {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>
 builtin.func @gpu_wmma_mma_loop_op(%arg0: memref<128x128xf16>, %arg1: memref<128x128xf16>, %arg2: memref<128x128xf16>) {
   %c0 = arith.constant 0 : index
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index 1caa3f415ee20..3f07f173ec875 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -1040,44 +1040,30 @@ module {
 // -----

 llvm.func @wmmaLoadOp_invalid_mem_space(%arg0: !llvm.ptr, %arg1: i32) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.load.a.f16.row.stride' op expected operands to be a source pointer in memory space 0, 1, 3 followed by ldm of the source}}
-  %0 = nvvm.wmma.m16n16k16.load.a.f16.row.stride %arg0, %arg1 : (!llvm.ptr, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
-
-  llvm.return
-}
-
-// -----
-
-llvm.func @wmmaLoadOp_invalid_missing_ldm(%arg0: !llvm.ptr, %arg1: i32) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.load.a.f16.row.stride' op expected operands to be a source pointer in memory space 0, 1, 3 followed by ldm of the source}}
-  %0 = nvvm.wmma.m16n16k16.load.a.f16.row.stride %arg0: (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
-
-  llvm.return
-}
-
-// -----
-
-llvm.func @wmmaLoadOp_invalid_AOp(%arg0: !llvm.ptr, %arg1: i32) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.load.a.f16.row.stride' op expected result type of loadAOp and loadBOp to be a struct of 8 <2 x half>s}}
-  %0 = nvvm.wmma.m16n16k16.load.a.f16.row.stride %arg0, %arg1 : (!llvm.ptr, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
-
+  // expected-error@+1 {{'nvvm.wmma.load' op expected source pointer in memory space 0, 1, 3}}
+  %0 = nvvm.wmma.load %arg0, %arg1
+    {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+    : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
   llvm.return
 }

 // -----

 llvm.func @wmmaLoadOp_invalid_AOp(%arg0: !llvm.ptr, %arg1: i32) {
-  // expected-error@+1 {{nvvm.wmma.m16n16k16.load.a.f16.row.stride' op expected result type of loadAOp and loadBOp to be a struct of 8 <2 x half>s}}
-  %0 = nvvm.wmma.m16n16k16.load.a.f16.row.stride %arg0, %arg1 : (!llvm.ptr, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
-
+  // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 8 elements of type 'vector<2xf16>'}}
+  %0 = nvvm.wmma.load %arg0, %arg1
+    {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+    : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
   llvm.return
 }

 // -----

 llvm.func @wmmaLoadOp_invalid_BOp(%arg0: !llvm.ptr, %arg1: i32) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.load.b.f16.row.stride' op expected result type of loadAOp and loadBOp to be a struct of 8 <2 x half>s}}
-  %0 = nvvm.wmma.m16n16k16.load.b.f16.row.stride %arg0, %arg1 : (!llvm.ptr, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+  // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 8 elements of type 'vector<2xf16>'}}
+  %0 = nvvm.wmma.load %arg0, %arg1
+    {eltype = "f16", frag = "b", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+    : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
   llvm.return
 }

@@ -1085,29 +1071,23 @@ llvm.func @wmmaLoadOp_invalid_BOp(%arg0: !llvm.ptr, %arg1: i32) {
 // -----

 llvm.func @wmmaLoadOp_invalid_COp(%arg0: !llvm.ptr, %arg1: i32) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.load.c.f16.row.stride' op expected result type of loadCOp to be a struct of 4 <2 x half>s or 8 f32s}}
-  %0 = nvvm.wmma.m16n16k16.load.c.f16.row.stride %arg0, %arg1 : (!llvm.ptr, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)>
+  // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 4 elements of type 'vector<2xf16>'}}
+  %0 = nvvm.wmma.load %arg0, %arg1
+    {eltype = "f16", frag = "c", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+    : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)>
   llvm.return
 }

 // -----

-llvm.func @wmmaStoreOp_invalid_mem_space(%arg0: !llvm.ptr, %arg1: vector<2 x f16>,
+llvm.func @wmmaStoreOp_invalid_mem_space(%arg0: !llvm.ptr, %arg1: i32,
                                          %arg2: vector<2 x f16>, %arg3: vector<2 x f16>,
-                                         %arg4: vector<2 xf16>, %arg5: i32) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.store.d.f16.row.stride' op expected operands to be a source pointer in memoryspace 0, 1, 3 followed by ldm of the source}}
-  nvvm.wmma.m16n16k16.store.d.f16.row.stride %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 : !llvm.ptr, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, i32
-  llvm.return
-}
-
-// -----
-
-llvm.func @wmmaStoreOp_invalid_missing_ldm(%arg0: !llvm.ptr, %arg1: vector<2 x f16>,
-                                           %arg2: vector<2 x f16>, %arg3: vector<2 x f16>,
-                                           %arg4: vector<2 xf16>, %arg5: i32) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.store.d.f16.row.stride' op expected operands to be a source pointer in memoryspace 0, 1, 3 followed by ldm of the source}}
-  nvvm.wmma.m16n16k16.store.d.f16.row.stride %arg0, %arg1, %arg2, %arg3, %arg4 : !llvm.ptr, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>
+                                         %arg4: vector<2 x f16>, %arg5: vector<2 xf16>) {
+  // expected-error@+1 {{'nvvm.wmma.store' op expected operands to be a source pointer in memory space 0, 1, 3}}
+  nvvm.wmma.store %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
+    {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+    : !llvm.ptr, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>
   llvm.return
 }

@@ -1123,8 +1103,12 @@ llvm.func @gpu_wmma_mma_op_invalid_operands(%arg0: vector<2 x f16>, %arg1: vecto
                                             %arg14: vector<2 x f16>, %arg15: vector<2 x f16>,
                                             %arg16: vector<2 x f16>, %arg17: vector<2 x f16>,
                                             %arg18: vector<2 x f16>) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.mma.row.row.f16.f16' op expected 20 <2 x half>s as operands}}
-  %0 = nvvm.wmma.m16n16k16.mma.row.row.f16.f16 %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18 : vector<2 x f16> -> !llvm.struct<(vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)>
+  // expected-error@+1 {{'nvvm.wmma.mma' op expected 20 arguments}}
+  %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18
+    {eltypeA = "f16", eltypeB = "f16", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
+    : (vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>,
+       vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)
+    -> !llvm.struct<(vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)>
   llvm.return
 }

@@ -1140,9 +1124,12 @@ llvm.func @gpu_wmma_mma_op_results(%arg0: vector<2 x f16>, %arg1: vector<2 x f16
                                    %arg14: vector<2 x f16>, %arg15: vector<2 x f16>,
                                    %arg16: vector<2 x f16>, %arg17: vector<2 x f16>,
                                    %arg18: vector<2 x f16>, %arg19: vector<2 x f16>) {
-  // expected-error@+1 {{expected result type to be a struct of 4 <2 x half>s}}
-  %0 = nvvm.wmma.m16n16k16.mma.row.row.f16.f16 %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19 : vector<2 x f16> -> !llvm.struct<(vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)>
-  llvm.return
+  // expected-error@+1 {{'nvvm.wmma.mma' op expected destination type is a structure of 4 elements of type 'vector<2xf16>'}}
+  %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19
+    {eltypeA = "f16", eltypeB = "f16", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
+    : (vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>,
+       vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)
+    -> !llvm.struct<(vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)>
+  llvm.return
 }

 // -----

@@ -1157,8 +1144,10 @@ llvm.func @gpu_wmma_mma_op_invalid_ab_operands(%arg0: vector<2 x f16>, %arg1: ve
                                                %arg14: vector<2 x f16>, %arg15: f32,
                                                %arg16: f32, %arg17: f32, %arg18: f32,
                                                %arg19: f32, %arg20: f32, %arg21: f32,
                                                %arg22: f32, %arg23: f32) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.mma.row.row.f32.f32' op expected 16 <2 x half>s for `a` and `b` operand}}
-  %0 = nvvm.wmma.m16n16k16.mma.row.row.f32.f32 %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23 : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
+  // expected-error@+1 {{'nvvm.wmma.mma' op expected argument 15 to be of type 'vector<2xf16>'}}
+  %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23
+    {eltypeA = "f16", eltypeB = "f32", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
+    : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
   llvm.return
 }

@@ -1174,8 +1163,10 @@ llvm.func @gpu_wmma_mma_op_invalid_c_operand(%arg0: vector<2 x f16>, %arg1: vect
                                              %arg14: vector<2 x f16>, %arg15: vector<2xf16>,
                                              %arg16: f32, %arg17: f32, %arg18: f32,
                                              %arg19: f32, %arg20: f32, %arg21: f32,
                                              %arg22: f32, %arg23: vector<2xf16>) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.mma.row.row.f32.f32' op expected 8 f32s for `c` operand}}
-  %0 = nvvm.wmma.m16n16k16.mma.row.row.f32.f32 %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23 : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, vector<2xf16>) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
+  // expected-error@+1 {{'nvvm.wmma.mma' op expected argument 23 to be of type 'f32'}}
+  %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23
+    {eltypeA = "f16", eltypeB = "f32", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
+    : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, vector<2xf16>) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
   llvm.return
 }

@@ -1191,8 +1182,10 @@ llvm.func @gpu_wmma_mma_op_invalid_result(%arg0: vector<2 x f16>, %arg1: vector<
                                           %arg14: vector<2 x f16>, %arg15: vector<2xf16>,
                                           %arg16: f32, %arg17: f32, %arg18: f32,
                                           %arg19: f32, %arg20: f32, %arg21: f32,
                                           %arg22: f32, %arg23: f32) {
-  // expected-error@+1 {{'nvvm.wmma.m16n16k16.mma.row.row.f32.f32' op expected result type to be a struct of 8 f32s}}
-  %0 = nvvm.wmma.m16n16k16.mma.row.row.f32.f32 %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23 : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, vector<2xf16>)>
+  // expected-error@+1 {{'nvvm.wmma.mma' op expected destination type is a structure of 8 elements of type 'f32'}}
+  %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23
+    {eltypeA = "f16", eltypeB = "f32", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
+    : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, vector<2xf16>)>
   llvm.return
 }
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index 1e3d6dc739785..638a5ab47dd0d 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -69,6 +69,27 @@ func @nvvm_mma(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
   llvm.return %0 : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
 }

+func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr<i32>, %arg1 : i32) -> !llvm.struct<(i32, i32, i32, i32)> {
+  // CHECK: nvvm.wmma.load {{.*}} {eltype = "tf32", frag = "a", k = 8 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+  %0 = nvvm.wmma.load %arg0, %arg1
+    {eltype = "tf32", frag = "a", k = 8 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+    : (!llvm.ptr<i32>) -> !llvm.struct<(i32, i32, i32, i32)>
+  llvm.return %0 : !llvm.struct<(i32, i32, i32, i32)>
+}
+
+func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 : i32,
+                    %6 : i32, %7 : i32, %8 : f32, %9 : f32, %10 : f32,
+                    %11 : f32, %12 : f32, %13 : f32, %14 : f32, %15 : f32)
+    -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> {
+  // CHECK: nvvm.wmma.mma {{.*}} {eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
+  %r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15
+    {eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
+    : (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)
+    -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
+  llvm.return %r : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
+}
+
 // -----

 // expected-error@below {{attribute attached to unexpected op}}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 8fefd7866fd5d..08859b22c0d83 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -77,18 +77,22 @@ llvm.func @nvvm_mma(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
 // in the LLVM NVPTX backend.
 llvm.func @gpu_wmma_load_op(%arg0: !llvm.ptr<i32, 3>, %arg1: i32) {
   // CHECK: call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p3i32(i32 addrspace(3)* %{{.*}}, i32 %{{.*}})
-  %0 = nvvm.wmma.m16n16k16.load.a.f16.row.stride %arg0, %arg1 : (!llvm.ptr<i32, 3>, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+  %0 = nvvm.wmma.load %arg0, %arg1
+    {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+    : (!llvm.ptr<i32, 3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
   llvm.return
 }

 // The test below checks the correct mapping of the nvvm.wmma.*.store.* op to the correct intrinsic
 // in the LLVM NVPTX backend.
-llvm.func @gpu_wmma_store_op(%arg0: !llvm.ptr<i32, 3>, %arg1: vector<2 x f16>,
+llvm.func @gpu_wmma_store_op(%arg0: !llvm.ptr<i32, 3>, %arg1: i32,
                              %arg2: vector<2 x f16>, %arg3: vector<2 x f16>,
-                             %arg4: vector<2 xf16>, %arg5: i32) {
+                             %arg4: vector<2 xf16>, %arg5: vector<2 x f16>) {
   // CHECK: call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p3i32(i32 addrspace(3)* %{{.*}}, <2 x half> {{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, i32 %{{.*}})
-  nvvm.wmma.m16n16k16.store.d.f16.row.stride %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 : !llvm.ptr<i32, 3>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, i32
+  nvvm.wmma.store %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
+    {eltype = "f16", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+    : !llvm.ptr<i32, 3>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>
   llvm.return
 }

@@ -105,8 +109,32 @@ llvm.func @gpu_wmma_mma_op(%arg0: vector<2 x f16>, %arg1: vector<2 x f16>,
                            %arg16: vector<2 x f16>, %arg17: vector<2 x f16>,
                            %arg18: vector<2 x f16>, %arg19: vector<2 x f16>) {
   // CHECK: call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
-  %0 = nvvm.wmma.m16n16k16.mma.row.row.f16.f16 %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19 : vector<2 x f16> -> !llvm.struct<(vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)>
+  %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19
+    {eltypeA = "f16", eltypeB = "f16", k = 16 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
+    : (vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>,
+       vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>,
+       vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>,
+       vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)
+    -> !llvm.struct<(vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)>
+  llvm.return
+}
+
+llvm.func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr<i32>, %arg1 : i32) {
+  // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0i32(i32* %{{.*}}, i32 %{{.*}})
+  %0 = nvvm.wmma.load %arg0, %arg1
+    {eltype = "tf32", frag = "a", k = 8 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
+    : (!llvm.ptr<i32>) -> !llvm.struct<(i32, i32, i32, i32)>
+  llvm.return
+}
+llvm.func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 : i32,
+                         %6 : i32, %7 : i32, %8 : f32, %9 : f32, %10 : f32,
+                         %11 : f32, %12 : f32, %13 : f32, %14 : f32, %15 : f32) {
+  // CHECK: { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})
+  %r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15
+    {eltypeA = "tf32", eltypeB = "f32", k = 8 : i32, layoutA = "row", layoutB = "row", m = 16 : i32, n = 16 : i32}
+    : (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)
+    -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
   llvm.return
 }
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 39c258ade3fe2..cd1e34d1964d2 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3295,6 +3295,14 @@ gentbl_cc_library(
             ],
             "include/mlir/Dialect/LLVMIR/NVVMOpsDialect.cpp.inc",
         ),
+        (
+            ["-gen-enum-decls"],
+            "include/mlir/Dialect/LLVMIR/NVVMOpsEnums.h.inc",
+        ),
+        (
+            ["-gen-enum-defs"],
+            "include/mlir/Dialect/LLVMIR/NVVMOpsEnums.cpp.inc",
+        ),
     ],
     tblgen = ":mlir-tblgen",
     td_file = "include/mlir/Dialect/LLVMIR/NVVMOps.td",
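
For quick reference, the sketch below (not part of the patch itself) contrasts the old per-variant mnemonic with the new attribute-based form for an f16 m16n16k16 A-fragment load, using the same attribute encoding exercised by the tests above. The pointer type `!llvm.ptr<i32, 3>` and the SSA names `%ptr`/`%stride` are illustrative assumptions, not taken from the diff.

// Old form: one op mnemonic per geometry/fragment/element-type/layout combination.
%a_old = nvvm.wmma.m16n16k16.load.a.f16.row.stride %ptr, %stride
  : (!llvm.ptr<i32, 3>, i32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>

// New form: a single nvvm.wmma.load op; the shape (m, n, k), fragment,
// element type, and layout are carried as attributes, which is what lets
// the TableGen helpers enumerate the valid intrinsic combinations.
%a_new = nvvm.wmma.load %ptr, %stride
  {eltype = "f16", frag = "a", k = 16 : i32, layout = "row", m = 16 : i32, n = 16 : i32}
  : (!llvm.ptr<i32, 3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>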