diff --git a/llvm/include/llvm/MC/MCDecoder.h b/llvm/include/llvm/MC/MCDecoder.h
index 70762a4a5ebae..459c8a6a5ea34 100644
--- a/llvm/include/llvm/MC/MCDecoder.h
+++ b/llvm/include/llvm/MC/MCDecoder.h
@@ -12,6 +12,7 @@
 
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/Support/MathExtras.h"
+#include <bitset>
 #include <cassert>
 
 namespace llvm::MCD {
@@ -48,6 +49,15 @@ fieldFromInstruction(const InsnType &Insn, unsigned StartBit,
   return Insn.extractBitsAsZExtValue(NumBits, StartBit);
 }
 
+template <size_t N>
+uint64_t fieldFromInstruction(const std::bitset<N> &Insn, unsigned StartBit,
+                              unsigned NumBits) {
+  assert(StartBit + NumBits <= N && "Instruction field out of bounds!");
+  assert(NumBits <= 64 && "Cannot support >64-bit extractions!");
+  const std::bitset<N> Mask(maskTrailingOnes<uint64_t>(NumBits));
+  return ((Insn >> StartBit) & Mask).to_ullong();
+}
+
 // Helper function for inserting bits extracted from an encoded instruction into
 // an integer-typed field.
 template <typename IntType>
@@ -62,6 +72,13 @@ insertBits(IntType &field, IntType bits, unsigned startBit, unsigned numBits) {
   field |= bits << startBit;
 }
 
+// InsnBitWidth is essentially a type trait used by the decoder emitter to query
+// the supported bitwidth for a given type. But default, the value is 0, making
+// it an invalid type for use as `InsnType` when instantiating the decoder.
+// Individual targets are expected to provide specializations for these based
+// on their usage.
+template <typename T> static constexpr uint32_t InsnBitWidth = 0;
+
 } // namespace llvm::MCD
 
 #endif // LLVM_MC_MCDECODER_H
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 619ff4e5c73c4..05295ae73be23 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -6,7 +6,8 @@ tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler
+              --specialize-decoders-per-bitwidth)
 tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 6a2beeed41dfd..80d194afa926b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Support/Compiler.h"
 
 using namespace llvm;
+using namespace llvm::MCD;
 
 #define DEBUG_TYPE "amdgpu-disassembler"
 
@@ -446,6 +447,14 @@ static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
 
 #include "AMDGPUGenDisassemblerTables.inc"
 
+// Define bitwidths for various types used to instantiate the decoder.
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 64;
+template <>
+static constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<96>> = 96;
+template <>
+static constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<128>> = 128;
+
 //===----------------------------------------------------------------------===//
 //
 //===----------------------------------------------------------------------===//
@@ -498,26 +507,24 @@ template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
   return Res;
 }
 
-static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<96> eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+  using namespace llvm::support::endian;
   assert(Bytes.size() >= 12);
-  uint64_t Lo =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<96> Lo(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  uint64_t Hi =
-      support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<96> Hi(read<uint32_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(4);
-  return DecoderUInt128(Lo, Hi);
+  return (Hi << 64) | Lo;
 }
 
-static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<128> eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+  using namespace llvm::support::endian;
   assert(Bytes.size() >= 16);
-  uint64_t Lo =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<128> Lo(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  uint64_t Hi =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<128> Hi(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  return DecoderUInt128(Lo, Hi);
+  return (Hi << 64) | Lo;
 }
 
 void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
@@ -600,14 +607,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
     // encodings
     if (isGFX1250() && Bytes.size() >= 16) {
-      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      std::bitset<128> DecW = eat16Bytes(Bytes);
       if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS))
         break;
       Bytes = Bytes_.slice(0, MaxInstBytesNum);
     }
 
     if (isGFX11Plus() && Bytes.size() >= 12) {
-      DecoderUInt128 DecW = eat12Bytes(Bytes);
+      std::bitset<96> DecW = eat12Bytes(Bytes);
 
       if (isGFX11() &&
           tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
@@ -642,7 +649,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
     } else if (Bytes.size() >= 16 &&
                STI.hasFeature(AMDGPU::FeatureGFX950Insts)) {
-      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      std::bitset<128> DecW = eat16Bytes(Bytes);
       if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS))
         break;
 
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index f4d164bf10c3c..ded447b6f8d5a 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -32,44 +32,6 @@ class MCOperand;
 class MCSubtargetInfo;
 class Twine;
 
-// Exposes an interface expected by autogenerated code in
-// FixedLenDecoderEmitter
-class DecoderUInt128 {
-private:
-  uint64_t Lo = 0;
-  uint64_t Hi = 0;
-
-public:
-  DecoderUInt128() = default;
-  DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {}
-  operator bool() const { return Lo || Hi; }
-  uint64_t extractBitsAsZExtValue(unsigned NumBits,
-                                  unsigned BitPosition) const {
-    assert(NumBits && NumBits <= 64);
-    assert(BitPosition < 128);
-    uint64_t Val;
-    if (BitPosition < 64)
-      Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition);
-    else
-      Val = Hi >> (BitPosition - 64);
-    return Val & ((uint64_t(2) << (NumBits - 1)) - 1);
-  }
-  DecoderUInt128 operator&(const DecoderUInt128 &RHS) const {
-    return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi);
-  }
-  DecoderUInt128 operator&(const uint64_t &RHS) const {
-    return *this & DecoderUInt128(RHS);
-  }
-  DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); }
-  bool operator==(const DecoderUInt128 &RHS) {
-    return Lo == RHS.Lo && Hi == RHS.Hi;
-  }
-  bool operator!=(const DecoderUInt128 &RHS) {
-    return Lo != RHS.Lo || Hi != RHS.Hi;
-  }
-  bool operator!=(const int &RHS) { return *this != DecoderUInt128(RHS); }
-};
-
 //===----------------------------------------------------------------------===//
 // AMDGPUDisassembler
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 47329b2c2f4d2..531238ae85029 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -7,7 +7,8 @@ tablegen(LLVM RISCVGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM RISCVGenCompressInstEmitter.inc -gen-compress-inst-emitter)
 tablegen(LLVM RISCVGenMacroFusion.inc -gen-macro-fusion-pred)
 tablegen(LLVM RISCVGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler
+              --specialize-decoders-per-bitwidth)
 tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM RISCVGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM RISCVGenMCPseudoLowering.inc -gen-pseudo-lowering)
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index de1bdb4a8811c..c8b89f5192c3d 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -558,7 +558,7 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
   return decodeZcmpRlist(Inst, Imm, Address, Decoder);
 }
 
-static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
+static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint16_t Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   uint32_t Rs1 = fieldFromInstruction(Insn, 7, 5);
@@ -701,6 +701,12 @@ static constexpr DecoderListEntry DecoderList32[]{
     {DecoderTableZdinxRV32Only32, {}, "RV32-only Zdinx (Double in Integer)"},
 };
 
+// Define bitwidths for various types used to instantiate the decoder.
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint16_t> = 16;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
+// Use uint64_t to represent 48 bit instructions.
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 48;
+
 DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
                                                  ArrayRef<uint8_t> Bytes,
                                                  uint64_t Address,
@@ -711,9 +717,7 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
   }
   Size = 4;
 
-  // Use uint64_t to match getInstruction48. decodeInstruction is templated
-  // on the Insn type.
-  uint64_t Insn = support::endian::read32le(Bytes.data());
+  uint32_t Insn = support::endian::read32le(Bytes.data());
 
   for (const DecoderListEntry &Entry : DecoderList32) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -759,9 +763,7 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
   }
   Size = 2;
 
-  // Use uint64_t to match getInstruction48. decodeInstruction is templated
-  // on the Insn type.
-  uint64_t Insn = support::endian::read16le(Bytes.data());
+  uint16_t Insn = support::endian::read16le(Bytes.data());
 
   for (const DecoderListEntry &Entry : DecoderList16) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
diff --git a/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td b/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td
new file mode 100644
index 0000000000000..b4142e983ef77
--- /dev/null
+++ b/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td
@@ -0,0 +1,175 @@
+// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-DEFAULT
+// RUN: llvm-tblgen -gen-disassembler -specialize-decoders-per-bitwidth -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-SPECIALIZE-NO-TABLE
+// RUN: llvm-tblgen -gen-disassembler -specialize-decoders-per-bitwidth -use-fn-table-in-decode-to-mcinst -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-SPECIALIZE-TABLE
+
+
+include "llvm/Target/Target.td"
+
+def archInstrInfo : InstrInfo { }
+
+def arch : Target {
+  let InstructionSet = archInstrInfo;
+}
+
+let Namespace = "arch" in {
+  def R0 : Register<"r0">;
+  def R1 : Register<"r1">;
+  def R2 : Register<"r2">;
+  def R3 : Register<"r3">;
+}
+def Regs : RegisterClass<"Regs", [i32], 32, (add R0, R1, R2, R3)>;
+
+// Bit 0 of the encoding determines the size (8 or 16 bits).
+// Bits {3..1} define the number of operands encoded.
+class Instruction8Bit<int NumOps> : Instruction {
+  let Size = 1;
+  let OutOperandList = (outs);
+  field bits<8> Inst;
+  let Inst{0} = 0;
+  let Inst{3-1} = NumOps;
+}
+
+class Instruction16Bit<int NumOps> : Instruction {
+  let Size = 2;
+  let OutOperandList = (outs);
+  field bits<16> Inst;
+  let Inst{0} = 1;
+  let Inst{3-1} = NumOps;
+}
+
+// Define instructions to generate 4 cases in decodeToMCInst.
+// Each register operand needs 2 bits to encode.
+
+// An instruction with no inputs.
+def Inst0 : Instruction8Bit<0> {
+  let Inst{7-4} = 0;
+  let InOperandList = (ins);
+  let AsmString = "Inst0";
+}
+
+// An instruction with a single input.
+def Inst1 : Instruction8Bit<1> {
+  bits<2> r0;
+  let Inst{5-4} = r0;
+  let Inst{7-6} = 0;
+  let InOperandList = (ins Regs:$r0);
+  let AsmString = "Inst1";
+}
+
+// An instruction with two inputs.
+def Inst2 : Instruction16Bit<2> {
+  bits<2> r0;
+  bits<2> r1;
+  let Inst{5-4} = r0;
+  let Inst{7-6} = r1;
+  let Inst{15-8} = 0;
+  let InOperandList = (ins Regs:$r0, Regs:$r1);
+  let AsmString = "Inst2";
+}
+
+// An instruction with three inputs. .
+def Inst3 : Instruction16Bit<3> {
+  bits<2> r0;
+  bits<2> r1;
+  bits<2> r2;
+  let Inst{5-4} = r0;
+  let Inst{7-6} = r1;
+  let Inst{9-8} = r2;
+  let Inst{15-10} = 0;
+  let InOperandList = (ins Regs:$r0, Regs:$r1, Regs:$r2);
+  let AsmString = "Inst3";
+}
+
+// -----------------------------------------------------------------------------
+// In the default case, we emit a single decodeToMCinst function and DecodeIdx
+// is shared across all bitwidths.
+
+// CHECK-DEFAULT-LABEL: DecoderTable8[25]
+// CHECK-DEFAULT:         DecodeIdx: 0
+// CHECK-DEFAULT:         DecodeIdx: 1
+// CHECK-DEFAULT:       };
+
+// CHECK-DEFAULT-LABEL: DecoderTable16[25]
+// CHECK-DEFAULT:         DecodeIdx: 2
+// CHECK-DEFAULT:         DecodeIdx: 3
+// CHECK-DEFAULT:       };
+
+// CHECK-DEFAULT-LABEL: template <typename InsnType>
+// CHECK-DEFAULT-NEXT:  static DecodeStatus decodeToMCInst
+// CHECK-DEFAULT:         case 0
+// CHECK-DEFAULT:         case 1
+// CHECK-DEFAULT:         case 2
+// CHECK-DEFAULT:         case 3
+
+// -----------------------------------------------------------------------------
+// When we specialize per bitwidth, we emit 2 decodeToMCInst functions and
+// DecodeIdx is assigned per bit width.
+
+// CHECK-SPECIALIZE-NO-TABLE-LABEL:   DecoderTable8[25]
+// CHECK-SPECIALIZE-NO-TABLE:           DecodeIdx: 0
+// CHECK-SPECIALIZE-NO-TABLE:           DecodeIdx: 1
+// CHECK-SPECIALIZE-NO-TABLE:         };
+
+// CHECK-SPECIALIZE-NO-TABLE-LABEL:   template <typename InsnType>
+// CHECK-SPECIALIZE-NO-TABLE-NEXT:    static std::enable_if_t<InsnBitWidth<InsnType> == 8, DecodeStatus>
+// CHECK-SPECIALIZE-NO-TABLE-NEXT:    decodeToMCInst
+// CHECK-SPECIALIZE-NO-TABLE:           case 0
+// CHECK-SPECIALIZE-NO-TABLE:           case 1
+
+// CHECK-SPECIALIZE-NO-TABLE-LABEL:   DecoderTable16[25]
+// CHECK-SPECIALIZE-NO-TABLE:           DecodeIdx: 0
+// CHECK-SPECIALIZE-NO-TABLE:           DecodeIdx: 1
+// CHECK-SPECIALIZE-NO-TABLE:         };
+
+// CHECK-SPECIALIZE-NO-TABLE-LABEL:   template <typename InsnType>
+// CHECK-SPECIALIZE-NO-TABLE-NEXT:    static std::enable_if_t<InsnBitWidth<InsnType> == 16, DecodeStatus>
+// CHECK-SPECIALIZE-NO-TABLE-NEXT:    decodeToMCInst
+// CHECK-SPECIALIZE-NO-TABLE:           case 0
+// CHECK-SPECIALIZE-NO-TABLE:           case 1
+
+// -----------------------------------------------------------------------------
+// Per bitwidth specialization with function table.
+
+// 8 bit deccoder table, functions, and function table.
+// CHECK-SPECIALIZE-TABLE-LABEL:    DecoderTable8[25]
+// CHECK-SPECIALIZE-TABLE:            DecodeIdx: 0
+// CHECK-SPECIALIZE-TABLE:            DecodeIdx: 1
+// CHECK-SPECIALIZE-TABLE:          };
+
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 8, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeFn_8bit_0
+
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 8, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeFn_8bit_1
+
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 8, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeToMCInst
+// CHECK-SPECIALIZE-TABLE-LABEL:      static constexpr DecodeFnTy decodeFnTable[] = {
+// CHECK-SPECIALIZE-TABLE-NEXT:       decodeFn_8bit_0,
+// CHECK-SPECIALIZE-TABLE-NEXT:       decodeFn_8bit_1,
+// CHECK-SPECIALIZE-TABLE-NEXT:     };
+
+// 16 bit deccoder table, functions, and function table.
+// CHECK-SPECIALIZE-TABLE-LABEL:    DecoderTable16[25]
+// CHECK-SPECIALIZE-TABLE:            DecodeIdx: 0
+// CHECK-SPECIALIZE-TABLE:            DecodeIdx: 1
+// CHECK-SPECIALIZE-TABLE:          };
+
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 16, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeFn_16bit_0
+
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 16, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeFn_16bit_1
+
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 16, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeToMCInst
+// CHECK-SPECIALIZE-TABLE-LABEL:      static constexpr DecodeFnTy decodeFnTable[] = {
+// CHECK-SPECIALIZE-TABLE-NEXT:       decodeFn_16bit_0,
+// CHECK-SPECIALIZE-TABLE-NEXT:       decodeFn_16bit_1,
+// CHECK-SPECIALIZE-TABLE-NEXT:     };
diff --git a/llvm/test/TableGen/DecoderEmitterFnTable.td b/llvm/test/TableGen/DecoderEmitterFnTable.td
index 7bed18c19a513..8929e6da716e6 100644
--- a/llvm/test/TableGen/DecoderEmitterFnTable.td
+++ b/llvm/test/TableGen/DecoderEmitterFnTable.td
@@ -71,14 +71,14 @@ def Inst3 : TestInstruction {
   let AsmString = "Inst3";
 }
 
-// CHECK-LABEL: DecodeStatus decodeFn0(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
-// CHECK-LABEL: DecodeStatus decodeFn1(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
-// CHECK-LABEL: DecodeStatus decodeFn2(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
-// CHECK-LABEL: DecodeStatus decodeFn3(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_0(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_1(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_2(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_3(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
 // CHECK-LABEL: decodeToMCInst(unsigned Idx, DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
 // CHECK: static constexpr DecodeFnTy decodeFnTable[]
-// CHECK-NEXT: decodeFn0,
-// CHECK-NEXT: decodeFn1,
-// CHECK-NEXT: decodeFn2,
-// CHECK-NEXT: decodeFn3,
+// CHECK-NEXT: decodeFn_0,
+// CHECK-NEXT: decodeFn_1,
+// CHECK-NEXT: decodeFn_2,
+// CHECK-NEXT: decodeFn_3,
 // CHECK: return decodeFnTable[Idx](S, insn, MI, Address, Decoder, DecodeComplete)
diff --git a/llvm/test/TableGen/HwModeEncodeDecode3.td b/llvm/test/TableGen/HwModeEncodeDecode3.td
index dbbf866f057e5..5e9ac7d17e45a 100644
--- a/llvm/test/TableGen/HwModeEncodeDecode3.td
+++ b/llvm/test/TableGen/HwModeEncodeDecode3.td
@@ -118,8 +118,6 @@ def unrelated: Instruction {
 // exact duplicates and could effectively be merged into one.
 // DECODER-LABEL: DecoderTable32
 // DECODER-DAG: Opcode: bar
-// DECODER-LABEL: DecoderTable64
-// DECODER-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-LABEL: DecoderTable_ModeA32
 // DECODER-DAG: Opcode: fooTypeEncA:foo
 // DECODER-DAG: Opcode: bar
@@ -138,13 +136,13 @@ def unrelated: Instruction {
 // DECODER-DAG: Opcode: unrelated
 // DECODER-LABEL: DecoderTableAlt_ModeC32
 // DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTable64
+// DECODER-DAG: Opcode: fooTypeEncDefault:foo
 
 // Under the 'O1' optimization level, unnecessary duplicate tables will be eliminated,
 // reducing the four ‘Alt’ tables down to just one.
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable32
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
-// DECODER-SUPPRESS-O1-LABEL: DecoderTable64
-// DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable_ModeA32
 // DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncA:foo
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
@@ -157,6 +155,8 @@ def unrelated: Instruction {
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
 // DECODER-SUPPRESS-O1-LABEL: DecoderTableAlt32
 // DECODER-SUPPRESS-O1-DAG: Opcode: unrelated
+// DECODER-SUPPRESS-O1-LABEL: DecoderTable64
+// DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncDefault:foo
 
 // Under the 'O2' optimization condition, instructions possessing the 'EncodingByHwMode'
 // attribute will be extracted from their original DecoderNamespace and placed into their
@@ -166,9 +166,6 @@ def unrelated: Instruction {
 // consider the interplay between HwMode and DecoderNamespace for their instructions.
 // DECODER-SUPPRESS-O2-LABEL: DecoderTable32
 // DECODER-SUPPRESS-O2-DAG: Opcode: bar
-// DECODER-SUPPRESS-O2-LABEL: DecoderTable64
-// DECODER-SUPPRESS-O2-NOT: Opcode: bar
-// DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-SUPPRESS-O2-LABEL: DecoderTable_ModeA32
 // DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncA:foo
 // DECODER-SUPPRESS-O2-NOT: Opcode: bar
@@ -181,6 +178,9 @@ def unrelated: Instruction {
 // DECODER-SUPPRESS-O2-NOT: Opcode: bar
 // DECODER-SUPPRESS-O2-LABEL: DecoderTableAlt32
 // DECODER-SUPPRESS-O2-DAG: Opcode: unrelated
+// DECODER-SUPPRESS-O2-LABEL: DecoderTable64
+// DECODER-SUPPRESS-O2-NOT: Opcode: bar
+// DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncDefault:foo
 
 // For 'bar' and 'unrelated', we didn't assign any HwModes for them,
 // they should keep the same in the following four tables.
diff --git a/llvm/test/TableGen/VarLenDecoder.td b/llvm/test/TableGen/VarLenDecoder.td
index 769c5895ec3c1..10e254f7673e6 100644
--- a/llvm/test/TableGen/VarLenDecoder.td
+++ b/llvm/test/TableGen/VarLenDecoder.td
@@ -47,6 +47,12 @@ def FOO32 : MyVarInst<MemOp32> {
   );
 }
 
+// Instruction length table
+// CHECK: InstrLenTable
+// CHECK: 27,
+// CHECK-NEXT: 43,
+// CHECK-NEXT: };
+
 // CHECK-SMALL:      /* 0 */       MCD::OPC_ExtractField, 3, 5,  // Inst{7-3} ...
 // CHECK-SMALL-NEXT: /* 3 */       MCD::OPC_FilterValue, 8, 4, 0, // Skip to: 11
 // CHECK-SMALL-NEXT: /* 7 */       MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16
@@ -61,11 +67,6 @@ def FOO32 : MyVarInst<MemOp32> {
 // CHECK-LARGE-NEXT: /* 14 */      MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32
 // CHECK-LARGE-NEXT: };
 
-// Instruction length table
-// CHECK: 27,
-// CHECK-NEXT: 43,
-// CHECK-NEXT: };
-
 // CHECK:      case 0:
 // CHECK-NEXT: tmp = fieldFromInstruction(insn, 8, 3);
 // CHECK-NEXT: if (!Check(S, DecodeRegClassRegisterClass(MI, tmp, Address, Decoder))) { return MCDisassembler::Fail; }
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index e4992b9e9e725..354c2a788d5b1 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
@@ -93,6 +94,16 @@ static cl::opt<bool> UseFnTableInDecodeToMCInst(
         "of the generated code."),
     cl::init(false), cl::cat(DisassemblerEmitterCat));
 
+// Enabling this option requires use of different `InsnType` for different
+// bitwidths and defining `InsnBitWidth` template specialization for the
+// `InsnType` types used. Some common specializations are already defined in
+// MCDecoder.h.
+static cl::opt<bool> SpecializeDecodersPerBitwidth(
+    "specialize-decoders-per-bitwidth",
+    cl::desc("Specialize the generated `decodeToMCInst` function per bitwidth. "
+             "Helps reduce the code size."),
+    cl::init(false), cl::cat(DisassemblerEmitterCat));
+
 STATISTIC(NumEncodings, "Number of encodings considered");
 STATISTIC(NumEncodingsLackingDisasm,
           "Number of encodings without disassembler info");
@@ -360,7 +371,8 @@ class DecoderEmitter {
   void emitPredicateFunction(formatted_raw_ostream &OS,
                              PredicateSet &Predicates) const;
   void emitDecoderFunction(formatted_raw_ostream &OS,
-                           DecoderSet &Decoders) const;
+                           const DecoderSet &Decoders,
+                           unsigned BucketBitWidth) const;
 
   // run - Output the code emitter
   void run(raw_ostream &o) const;
@@ -930,7 +942,8 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS,
 }
 
 void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
-                                         DecoderSet &Decoders) const {
+                                         const DecoderSet &Decoders,
+                                         unsigned BucketBitWidth) const {
   // The decoder function is just a big switch statement or a table of function
   // pointers based on the input decoder index.
 
@@ -944,12 +957,32 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
       "DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const "
       "MCDisassembler *Decoder, bool &DecodeComplete";
 
+  // Print the name of the decode function to OS.
+  auto PrintDecodeFnName = [&OS, BucketBitWidth](unsigned DecodeIdx) {
+    OS << "decodeFn";
+    if (BucketBitWidth != 0) {
+      OS << '_' << BucketBitWidth << "bit";
+    }
+    OS << '_' << DecodeIdx;
+  };
+
+  // Print the template statement.
+  auto PrintTemplate = [&OS, BucketBitWidth]() {
+    OS << "template <typename InsnType>\n";
+    OS << "static ";
+    if (BucketBitWidth != 0)
+      OS << "std::enable_if_t<InsnBitWidth<InsnType> == " << BucketBitWidth
+         << ", DecodeStatus>\n";
+    else
+      OS << "DecodeStatus ";
+  };
+
   if (UseFnTableInDecodeToMCInst) {
     // Emit a function for each case first.
     for (const auto &[Index, Decoder] : enumerate(Decoders)) {
-      OS << "template <typename InsnType>\n";
-      OS << "static DecodeStatus decodeFn" << Index << "(" << DecodeParams
-         << ") {\n";
+      PrintTemplate();
+      PrintDecodeFnName(Index);
+      OS << "(" << DecodeParams << ") {\n";
       OS << "  using namespace llvm::MCD;\n";
       OS << "  " << TmpTypeDecl;
       OS << "  [[maybe_unused]] TmpType tmp;\n";
@@ -960,9 +993,8 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
   }
 
   OS << "// Handling " << Decoders.size() << " cases.\n";
-  OS << "template <typename InsnType>\n";
-  OS << "static DecodeStatus decodeToMCInst(unsigned Idx, " << DecodeParams
-     << ") {\n";
+  PrintTemplate();
+  OS << "decodeToMCInst(unsigned Idx, " << DecodeParams << ") {\n";
   OS << "  using namespace llvm::MCD;\n";
   OS << "  DecodeComplete = true;\n";
 
@@ -970,12 +1002,14 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
     // Build a table of function pointers
     OS << "  using DecodeFnTy = DecodeStatus (*)(" << DecodeParams << ");\n";
     OS << "  static constexpr DecodeFnTy decodeFnTable[] = {\n";
-    for (size_t Index : llvm::seq(Decoders.size()))
-      OS << "    decodeFn" << Index << ",\n";
+    for (size_t Index : llvm::seq(Decoders.size())) {
+      OS << "    ";
+      PrintDecodeFnName(Index);
+      OS << ",\n";
+    }
     OS << "  };\n";
     OS << "  if (Idx >= " << Decoders.size() << ")\n";
     OS << "    llvm_unreachable(\"Invalid decoder index!\");\n";
-
     OS << "  return decodeFnTable[Idx](S, insn, MI, Address, Decoder, "
           "DecodeComplete);\n";
   } else {
@@ -2448,57 +2482,89 @@ namespace {
 )";
 
   // Do extra bookkeeping for variable-length encodings.
-  std::vector<unsigned> InstrLen;
   bool IsVarLenInst = Target.hasVariableLengthEncodings();
   unsigned MaxInstLen = 0;
   if (IsVarLenInst) {
-    InstrLen.resize(Target.getInstructions().size(), 0);
+    std::vector<unsigned> InstrLen(Target.getInstructions().size(), 0);
     for (const InstructionEncoding &Encoding : Encodings) {
       MaxInstLen = std::max(MaxInstLen, Encoding.getBitWidth());
       InstrLen[Target.getInstrIntValue(Encoding.getInstruction()->TheDef)] =
           Encoding.getBitWidth();
     }
+
+    // For variable instruction, we emit an instruction length table to let the
+    // decoder know how long the instructions are. You can see example usage in
+    // M68k's disassembler.
+    emitInstrLenTable(OS, InstrLen);
   }
 
-  // Map of (namespace, hwmode, size) tuple to encoding IDs.
-  std::map<std::tuple<StringRef, unsigned, unsigned>, std::vector<unsigned>>
-      EncMap;
+  // Map of (bitwidth, namespace, hwmode) tuple to encoding IDs.
+  // Its organized as a nested map, with the (namespace, hwmode) as the key for
+  // the inner map and bitwidth as the key for the outer map. We use std::map
+  // for deterministic iteration order so that the code emitted is also
+  // deterministic.
+  using InnerKeyTy = std::pair<StringRef, unsigned>;
+  using InnerMapTy = std::map<InnerKeyTy, std::vector<unsigned>>;
+  std::map<unsigned, InnerMapTy> EncMap;
+
   for (const auto &[HwModeID, EncodingIDs] : EncodingIDsByHwMode) {
     for (unsigned EncodingID : EncodingIDs) {
       const InstructionEncoding &Encoding = Encodings[EncodingID];
-      const Record *EncodingDef = Encoding.getRecord();
-      unsigned Size = EncodingDef->getValueAsInt("Size");
+      const unsigned BitWidth =
+          IsVarLenInst ? MaxInstLen : Encoding.getBitWidth();
       StringRef DecoderNamespace = Encoding.getDecoderNamespace();
-      EncMap[{DecoderNamespace, HwModeID, Size}].push_back(EncodingID);
+      EncMap[BitWidth][{DecoderNamespace, HwModeID}].push_back(EncodingID);
     }
   }
 
+  // Variable length instructions use the same `APInt` type for all instructions
+  // so we cannot specialize decoders based on instruction bitwidths (which
+  // requires using different `InstType` for differet bitwidths for the correct
+  // template specialization to kick in).
+  if (IsVarLenInst && SpecializeDecodersPerBitwidth)
+    PrintFatalError(
+        "Cannot specialize decoders for variable length instuctions");
+
+  // Entries in `EncMap` are already sorted by bitwidth. So bucketing per
+  // bitwidth can be done on-the-fly as we iterate over the map.
   DecoderTableInfo TableInfo;
   DecoderTableBuilder TableBuilder(Target, Encodings, TableInfo);
   unsigned OpcodeMask = 0;
 
-  for (const auto &[Key, EncodingIDs] : EncMap) {
-    auto [DecoderNamespace, HwModeID, Size] = Key;
-    const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
-    // Emit the decoder for this (namespace, hwmode, width) combination.
-    FilterChooser FC(Encodings, EncodingIDs);
-
-    // The decode table is cleared for each top level decoder function. The
-    // predicates and decoders themselves, however, are shared across all
-    // decoders to give more opportunities for uniqueing.
-    TableInfo.Table.clear();
-    TableBuilder.buildTable(FC);
+  for (const auto &[BitWidth, BWMap] : EncMap) {
+    for (const auto &[Key, EncodingIDs] : BWMap) {
+      auto [DecoderNamespace, HwModeID] = Key;
+
+      // Emit the decoder for this (namespace, hwmode, width) combination.
+      FilterChooser FC(Encodings, EncodingIDs);
+
+      // The decode table is cleared for each top level decoder function. The
+      // predicates and decoders themselves, however, are shared across
+      // different decoders to give more opportunities for uniqueing.
+      //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
+      //    across all decoder tables for a given bitwidth, else they are shared
+      //    across all decoder tables.
+      //  - predicates are shared across all decoder tables.
+      TableInfo.Table.clear();
+      TableBuilder.buildTable(FC);
+
+      // Print the table to the output stream.
+      OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
+                              BitWidth, EncodingIDs);
+    }
 
-    // Print the table to the output stream.
-    OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
-                            BitWidth, EncodingIDs);
+    // Each BitWidth get's its own decoders and decoder function if
+    // SpecializeDecodersPerBitwidth is enabled.
+    if (SpecializeDecodersPerBitwidth) {
+      emitDecoderFunction(OS, TableInfo.Decoders, BitWidth);
+      TableInfo.Decoders.clear();
+    }
   }
 
-  // For variable instruction, we emit a instruction length table
-  // to let the decoder know how long the instructions are.
-  // You can see example usage in M68k's disassembler.
-  if (IsVarLenInst)
-    emitInstrLenTable(OS, InstrLen);
+  // Emit the decoder function for the last bucket. This will also emit the
+  // single decoder function if SpecializeDecodersPerBitwidth = false.
+  if (!SpecializeDecodersPerBitwidth)
+    emitDecoderFunction(OS, TableInfo.Decoders, 0);
 
   const bool HasCheckPredicate =
       OpcodeMask &
@@ -2508,9 +2574,6 @@ namespace {
   if (HasCheckPredicate)
     emitPredicateFunction(OS, TableInfo.Predicates);
 
-  // Emit the decoder function.
-  emitDecoderFunction(OS, TableInfo.Decoders);
-
   // Emit the main entry point for the decoder, decodeInstruction().
   emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask);
 
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn
index 11bc537936508..9cc98cd8642d6 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn
@@ -2,7 +2,12 @@ import("//llvm/utils/TableGen/tablegen.gni")
 
 tablegen("AMDGPUGenDisassemblerTables") {
   visibility = [ ":Disassembler" ]
-  args = [ "-gen-disassembler" ]
+  args = [
+    "-gen-disassembler",
+    "-specialize-decoders-per-bitwidth",
+  ]
+
+  args = [   ]
   td_file = "../AMDGPU.td"
 }
 
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn
index cb579221fd366..447a67af6be7b 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn
@@ -2,7 +2,10 @@ import("//llvm/utils/TableGen/tablegen.gni")
 
 tablegen("RISCVGenDisassemblerTables") {
   visibility = [ ":Disassembler" ]
-  args = [ "-gen-disassembler" ]
+  args = [
+    "-gen-disassembler",
+    "-specialize-decoders-per-bitwidth",
+  ]
   td_file = "../RISCV.td"
 }