From ab0cba5d188652f37497045ed572ee49c17674d6 Mon Sep 17 00:00:00 2001
From: nicoboss <nico@bosshome.ch>
Date: Sat, 7 Jun 2025 16:22:56 +0200
Subject: [PATCH] Statically quantize tensors where the imatrix size differs and
 fix "Missing importance matrix for tensor XXX in a very low-bit quantization"
 by instead using the nearest static quant with better quality than the
 specified target quant

---
 src/llama-quant.cpp | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 159b1307a4c..f0f24d3198b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -846,12 +846,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                         // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
                         // tok_embd should be ignored in this case, since it always causes this warning
                         if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                            throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
-                                    int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+                            LLAMA_LOG_INFO("imatrix size %d is different from tensor size %d for %s\n", int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name); // no longer fatal: per this patch the tensor is then quantized without the imatrix
                         }
                     }
                 }
             }
+            if ((new_type == GGML_TYPE_IQ2_XXS || // fallback 1: these target types are replaced by Q2_K when no imatrix is available
+                 new_type == GGML_TYPE_IQ1_S   ||
+                (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight"))) && !imatrix) { // strcmp() is non-zero on mismatch: IQ1_M only counts for tensors other than token_embd.weight / output.weight
+                LLAMA_LOG_INFO("\n\n============================================================\n");
+                LLAMA_LOG_INFO("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+                LLAMA_LOG_INFO("The result will be garbage, so using GGML_TYPE_Q2_K\n");
+                LLAMA_LOG_INFO("============================================================\n\n");
+                new_type = GGML_TYPE_Q2_K; // override the requested type for this tensor only
+                // deliberately no throw std::runtime_error("Missing importance matrix ...") here: the Q2_K fallback above replaces the hard failure
+            }
+
+            if ((new_type == GGML_TYPE_IQ2_XS  || // fallback 2: these target types are replaced by Q3_K when no imatrix is available
+                 new_type == GGML_TYPE_IQ2_S   ||
+                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { // Q2_K only counts for the Q2_K_S ftype and tensors other than token_embd.weight; runs after the Q2_K fallback above, so a tensor just switched to Q2_K is re-checked here
+                LLAMA_LOG_INFO("\n\n============================================================\n");
+                LLAMA_LOG_INFO("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+                LLAMA_LOG_INFO("The result will be garbage, so using GGML_TYPE_Q3_K\n");
+                LLAMA_LOG_INFO("============================================================\n\n");
+                new_type = GGML_TYPE_Q3_K; // override the requested type for this tensor only
+            }
+
             if ((new_type == GGML_TYPE_IQ2_XXS ||
                  new_type == GGML_TYPE_IQ2_XS  ||
                  new_type == GGML_TYPE_IQ2_S   ||