6 changes: 5 additions & 1 deletion Libraries/Embedders/BaseConfiguration.swift
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.
 
 import Foundation
+import MLX
 
 /// Base ``LanguageModel`` configuration -- provides `modelType`
 /// and `quantization` (used in loading the model).
@@ -18,12 +19,15 @@ public struct BaseConfiguration: Codable, Sendable {
 
         public let groupSize: Int
         public let bits: Int
+        private var _mode: QuantizationMode? = nil
+        public var mode: QuantizationMode { _mode ?? .affine }
 
-        public var asTuple: (Int, Int) { (groupSize, bits) }
+        public var asTuple: (Int, Int, QuantizationMode) { (groupSize, bits, mode) }
 
         enum CodingKeys: String, CodingKey {
             case groupSize = "group_size"
             case bits = "bits"
+            case _mode = "mode"
         }
     }
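The `_mode ?? .affine` fallback keeps older checkpoints loading unchanged: a quantization block with no "mode" key decodes with `_mode == nil`, and `mode` reports `.affine`. A quick sketch of that behavior, assuming `BaseConfiguration.Quantization` can be decoded standalone (illustrative, not part of the diff):

    import Foundation

    // A legacy config with no "mode" key still decodes; the computed
    // `mode` property falls back to .affine.
    let legacy = #"{"group_size": 64, "bits": 4}"#.data(using: .utf8)!
    let q = try JSONDecoder().decode(BaseConfiguration.Quantization.self, from: legacy)
    print(q.mode)  // affine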

6 changes: 6 additions & 0 deletions Libraries/MLXLLM/LLMModelFactory.swift
@@ -340,6 +340,11 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
         defaultPrompt: ""
     )
 
+    static public let gpt_oss_20b_MXFP4_Q8 = ModelConfiguration(
+        id: "mlx-community/gpt-oss-20b-MXFP4-Q8",
+        defaultPrompt: "Why is the sky blue?"
+    )
+
Review comment (PR author): The MXFP4 quantization is now supported. This model was used to test that and the quantized KV cache. (See the usage sketch after this file's hunks.)


     private static func all() -> [ModelConfiguration] {
         [
             codeLlama13b4bit,
@@ -389,6 +394,7 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
             ling_mini_2_2bit,
             lfm2_8b_a1b_3bit_mlx,
             nanochat_d20_mlx,
+            gpt_oss_20b_MXFP4_Q8,
         ]
     }
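A short usage sketch for the new entry, assuming the `LLMModelFactory.loadContainer` and `ChatSession` conveniences this repo already provides, run from an async context (illustrative, not part of the diff):

    import MLXLLM
    import MLXLMCommon

    // Load the MXFP4-quantized GPT-OSS model via the new registry entry
    // and ask it the configuration's default prompt.
    let container = try await LLMModelFactory.shared.loadContainer(
        configuration: LLMRegistry.gpt_oss_20b_MXFP4_Q8)
    let session = ChatSession(container)
    print(try await session.respond(to: "Why is the sky blue?"))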

3 changes: 2 additions & 1 deletion Libraries/MLXLLM/Models/GPTOSS.swift
@@ -305,7 +305,8 @@ private class AttentionBlock: Module {
             scale: smScale,
             mask: .array(mask),
             groupSize: qcache.groupSize,
-            bits: qcache.bits
+            bits: qcache.bits,
+            mode: qcache.mode
         )
 
         return oProj(vHat.swappedAxes(1, 2).reshaped(B, L, -1))
15 changes: 10 additions & 5 deletions Libraries/MLXLLM/SwitchLayers.swift
@@ -142,24 +142,28 @@ class SwitchLinear: Module, Quantizable {
         return result
     }
 
-    func toQuantized(groupSize: Int = 64, bits: Int = 4) -> Module {
-        QuantizedSwitchLinear(self, groupSize: groupSize, bits: bits)
+    func toQuantized(groupSize: Int = 64, bits: Int = 4, mode: QuantizationMode) -> Module {
+        QuantizedSwitchLinear(self, groupSize: groupSize, bits: bits, mode: mode)
     }
 }
 
 class QuantizedSwitchLinear: SwitchLinear, Quantized {
     @ModuleInfo(key: "scales") var scales: MLXArray
-    @ModuleInfo(key: "biases") var biases: MLXArray
+    @ModuleInfo(key: "biases") var biases: MLXArray?
Review comment (PR author): `biases` are now optional. (See the sketch after this file's hunks.)


     let groupSize: Int
     let bits: Int
+    let mode: QuantizationMode
 
-    init(_ other: SwitchLinear, groupSize: Int = 64, bits: Int = 4) {
+    init(
+        _ other: SwitchLinear, groupSize: Int = 64, bits: Int = 4, mode: QuantizationMode = .affine
+    ) {
         self.groupSize = groupSize
         self.bits = bits
+        self.mode = mode
 
         let (quantizedWeight, scales, biases) = MLX.quantized(
-            other.weight, groupSize: groupSize, bits: bits)
+            other.weight, groupSize: groupSize, bits: bits, mode: mode)
 
         self._scales.wrappedValue = scales
         self._biases.wrappedValue = biases
@@ -183,6 +187,7 @@ class QuantizedSwitchLinear: SwitchLinear, Quantized {
             transpose: true,
             groupSize: self.groupSize,
             bits: self.bits,
+            mode: mode,
             sortedIndices: sortedIndices
         )
6 changes: 4 additions & 2 deletions Libraries/MLXLMCommon/Adapters/LoRA/DoRA+Layers.swift
@@ -147,7 +147,8 @@ public class QDoRALinear: QuantizedLinear, LoRALayer {
         super.init(
             weight: linear.weight, bias: linear.bias,
             scales: linear.scales, biases: linear.biases,
-            groupSize: linear.groupSize, bits: linear.bits
+            groupSize: linear.groupSize, bits: linear.bits,
+            mode: linear.mode
         )
 
         freeze()
@@ -171,7 +172,8 @@ public class QDoRALinear: QuantizedLinear, LoRALayer {
 
     public override func callAsFunction(_ x: MLXArray) -> MLXArray {
         let y = quantizedMatmul(
-            x, weight, scales: scales, biases: biases, groupSize: groupSize, bits: bits)
+            x, weight, scales: scales, biases: biases, groupSize: groupSize, bits: bits,
+            mode: mode)
         return forward(
             x: x, y: y,
             weight: dequantizedWeight, bias: bias,
3 changes: 2 additions & 1 deletion Libraries/MLXLMCommon/Adapters/LoRA/LoRAModel.swift
@@ -79,7 +79,8 @@ extension QuantizedLinear {
             scales: scales,
             biases: biases,
             groupSize: groupSize,
-            bits: bits
+            bits: bits,
+            mode: mode
         )
     }
 }
5 changes: 3 additions & 2 deletions Libraries/MLXLMCommon/AttentionUtils.swift
@@ -52,7 +52,7 @@ public func attentionWithCacheUpdate(
             mask: mask
         )
     }
-    if let quantizedKVCache = cache as? QuantizedKVCache {
+    if let quantizedKVCache = cache as? QuantizedKVCacheProtocol {
         let (quantizedKeys, quantizedValues) = quantizedKVCache.updateQuantized(
             keys: keys, values: values)
         return quantizedScaledDotProductAttention(
@@ -62,7 +62,8 @@ public func attentionWithCacheUpdate(
             scale: scale,
             mask: mask,
             groupSize: quantizedKVCache.groupSize,
-            bits: quantizedKVCache.bits
+            bits: quantizedKVCache.bits,
+            mode: quantizedKVCache.mode
         )
     } else {
         let (cachedKeys, cachedValues) = cache.update(keys: keys, values: values)
23 changes: 10 additions & 13 deletions Libraries/MLXLMCommon/BaseConfiguration.swift
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.
 
 import Foundation
+import MLX
 
 /// Base ``LanguageModel`` configuration -- provides `modelType`
 /// and `quantization` (used in loading the model).
@@ -18,20 +19,15 @@ public struct BaseConfiguration: Codable, Sendable {
 
         public let groupSize: Int
         public let bits: Int
-        public var quantMethod: String? = nil
-        public var linearClass: String? = nil
-        public var quantizationMode: String? = nil
Review comment (PR author): These were defined only so that they could be skipped during decoding (below). We can just skip them directly.

-        public var mode: String? = nil
+        private var _mode: QuantizationMode? = nil
+        public var mode: QuantizationMode { _mode ?? .affine }
 
-        public var asTuple: (Int, Int) { (groupSize, bits) }
+        public var asTuple: (Int, Int, QuantizationMode) { (groupSize, bits, mode) }
 
         enum CodingKeys: String, CodingKey {
             case groupSize = "group_size"
             case bits = "bits"
-            case quantMethod = "quant_method"
-            case linearClass = "linear_class"
-            case quantizationMode = "quantization_mode"
-            case mode = "mode"
+            case _mode = "mode"
         }
     }

@@ -115,10 +111,11 @@ public struct BaseConfiguration: Codable, Sendable {
             switch key.stringValue {
             case Quantization.CodingKeys.groupSize.rawValue: continue
             case Quantization.CodingKeys.bits.rawValue: continue
-            case Quantization.CodingKeys.quantMethod.rawValue: continue
-            case Quantization.CodingKeys.linearClass.rawValue: continue
-            case Quantization.CodingKeys.quantizationMode.rawValue: continue
-            case Quantization.CodingKeys.mode.rawValue: continue
+            case Quantization.CodingKeys._mode.rawValue: continue
+
+            // additional keys that are not layer instructions, see
+            // mlx-community/bitnet-b1.58-2B-4T-4bit
+            case "quant_method", "linear_class", "quantization_mode": continue
Review comment (PR author): Skip directly. (A decoding sketch follows this hunk.)


             default:
                 if let f = try? container.decode(Bool.self, forKey: key) {
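For reference, a sketch of the quantization block this decoder now handles. It is abridged and illustrative: it assumes `QuantizationMode` decodes from the strings "affine"/"mxfp4" and that `BaseConfiguration` exposes its `quantization` block as a property. "group_size", "bits", and "mode" feed `Quantization`, the three metadata keys are skipped inline, and any remaining key (like "lm_head") is treated as a per-layer instruction:

    import Foundation

    // Decode a config whose quantization block carries both real settings
    // and metadata keys that the decoder skips.
    let json = #"""
        {
            "model_type": "gpt_oss",
            "quantization": {
                "group_size": 32,
                "bits": 4,
                "mode": "mxfp4",
                "quant_method": "mxfp4",
                "lm_head": false
            }
        }
        """#.data(using: .utf8)!
    let base = try JSONDecoder().decode(BaseConfiguration.self, from: json)
    if let q = base.quantization {
        print(q.asTuple)  // (32, 4, mxfp4), given a string-backed QuantizationMode
    }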