Implement configurable context length #1749

Merged: 1 commit, merged on Dec 16, 2023
7 changes: 5 additions & 2 deletions gpt4all-backend/bert.cpp
@@ -714,8 +714,9 @@ Bert::~Bert() {
bert_free(d_ptr->ctx);
}

bool Bert::loadModel(const std::string &modelPath)
bool Bert::loadModel(const std::string &modelPath, int n_ctx)
{
(void)n_ctx;
d_ptr->ctx = bert_load_from_file(modelPath.c_str());
d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
d_ptr->modelLoaded = d_ptr->ctx != nullptr;
@@ -728,8 +729,10 @@ bool Bert::isModelLoaded() const
return d_ptr->modelLoaded;
}

size_t Bert::requiredMem(const std::string &/*modelPath*/)
size_t Bert::requiredMem(const std::string &modelPath, int n_ctx)
{
(void)modelPath;
(void)n_ctx;
return 0;
}

4 changes: 2 additions & 2 deletions gpt4all-backend/bert_impl.h
@@ -18,9 +18,9 @@ class Bert : public LLModel {

bool supportsEmbedding() const override { return true; }
bool supportsCompletion() const override { return true; }
bool loadModel(const std::string &modelPath) override;
bool loadModel(const std::string &modelPath, int n_ctx) override;
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath) override;
size_t requiredMem(const std::string &modelPath, int n_ctx) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
6 changes: 4 additions & 2 deletions gpt4all-backend/gptj.cpp
@@ -676,15 +676,17 @@ GPTJ::GPTJ()
d_ptr->modelLoaded = false;
}

size_t GPTJ::requiredMem(const std::string &modelPath) {
size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx) {
(void)n_ctx;
gptj_model dummy_model;
gpt_vocab dummy_vocab;
size_t mem_req;
gptj_model_load(modelPath, dummy_model, dummy_vocab, &mem_req);
return mem_req;
}

bool GPTJ::loadModel(const std::string &modelPath) {
bool GPTJ::loadModel(const std::string &modelPath, int n_ctx) {
(void)n_ctx;
std::mt19937 rng(time(NULL));
d_ptr->rng = rng;

4 changes: 2 additions & 2 deletions gpt4all-backend/gptj_impl.h
@@ -17,9 +17,9 @@ class GPTJ : public LLModel {

bool supportsEmbedding() const override { return false; }
bool supportsCompletion() const override { return true; }
bool loadModel(const std::string &modelPath) override;
bool loadModel(const std::string &modelPath, int n_ctx) override;
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath) override;
size_t requiredMem(const std::string &modelPath, int n_ctx) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
56 changes: 35 additions & 21 deletions gpt4all-backend/llamamodel.cpp
@@ -120,7 +120,8 @@ struct llama_file_hparams {
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
};

size_t LLamaModel::requiredMem(const std::string &modelPath) {
size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx) {
// TODO(cebtenzzre): update to GGUF
auto fin = std::ifstream(modelPath, std::ios::binary);
fin.seekg(0, std::ios_base::end);
size_t filesize = fin.tellg();
@@ -137,40 +138,31 @@ size_t LLamaModel::requiredMem(const std::string &modelPath) {
fin.read(reinterpret_cast<char*>(&hparams.n_layer), sizeof(hparams.n_layer));
fin.read(reinterpret_cast<char*>(&hparams.n_rot), sizeof(hparams.n_rot));
fin.read(reinterpret_cast<char*>(&hparams.ftype), sizeof(hparams.ftype));
const size_t n_ctx = 2048;
const size_t kvcache_element_size = 2; // fp16
const size_t est_kvcache_size = hparams.n_embd * hparams.n_layer * 2u * n_ctx * kvcache_element_size;
return filesize + est_kvcache_size;
}

bool LLamaModel::loadModel(const std::string &modelPath)
bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
{
gpt_params params;

// load the model
if (n_ctx < 8) {
std::cerr << "warning: minimum context size is 8, using minimum size.\n";
n_ctx = 8;
}

// -- load the model --

d_ptr->model_params = llama_model_default_params();

d_ptr->model_params.use_mmap = params.use_mmap;
d_ptr->model_params.use_mmap = params.use_mmap;
#if defined (__APPLE__)
d_ptr->model_params.use_mlock = true;
d_ptr->model_params.use_mlock = true;
#else
d_ptr->model_params.use_mlock = params.use_mlock;
d_ptr->model_params.use_mlock = params.use_mlock;
#endif

d_ptr->ctx_params = llama_context_default_params();

d_ptr->ctx_params.n_ctx = 2048;
d_ptr->ctx_params.seed = params.seed;
d_ptr->ctx_params.f16_kv = params.memory_f16;

// The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
// that we want this many logits so the state serializes consistently.
d_ptr->ctx_params.logits_all = true;

d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
d_ptr->ctx_params.n_threads = d_ptr->n_threads;
d_ptr->ctx_params.n_threads_batch = d_ptr->n_threads;

#ifdef GGML_USE_METAL
if (llama_verbose()) {
std::cerr << "llama.cpp: using Metal" << std::endl;
@@ -197,6 +189,28 @@ bool LLamaModel::loadModel(const std::string &modelPath)
return false;
}

const int n_ctx_train = llama_n_ctx_train(d_ptr->model);
if (n_ctx > n_ctx_train) {
std::cerr << "warning: model was trained on only " << n_ctx_train << " context tokens ("
<< n_ctx << " specified)\n";
}

// -- initialize the context --

d_ptr->ctx_params = llama_context_default_params();

d_ptr->ctx_params.n_ctx = n_ctx;
d_ptr->ctx_params.seed = params.seed;
d_ptr->ctx_params.f16_kv = params.memory_f16;

// The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
// that we want this many logits so the state serializes consistently.
d_ptr->ctx_params.logits_all = true;

d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
d_ptr->ctx_params.n_threads = d_ptr->n_threads;
d_ptr->ctx_params.n_threads_batch = d_ptr->n_threads;

d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
if (!d_ptr->ctx) {
#ifdef GGML_USE_KOMPUTE
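The memory estimate in requiredMem above approximates the fp16 KV cache as n_embd * n_layer * 2 (K and V) * n_ctx * 2 bytes and adds it to the model file size, so the estimate now scales with the configurable n_ctx instead of a hardcoded 2048. A minimal standalone sketch of that arithmetic, using hypothetical 7B-class hyperparameters (n_embd = 4096, n_layer = 32) purely for illustration:

#include <cstddef>
#include <cstdio>

// Rough KV-cache estimate mirroring LLamaModel::requiredMem: one K and one V
// tensor per layer, fp16 elements (2 bytes each), scaled by the context length.
static size_t est_kvcache_size(size_t n_embd, size_t n_layer, size_t n_ctx) {
    const size_t kvcache_element_size = 2; // fp16
    return n_embd * n_layer * 2u * n_ctx * kvcache_element_size;
}

int main() {
    const size_t n_embd = 4096, n_layer = 32; // hypothetical 7B-class values
    std::printf("n_ctx=2048: %zu MiB\n", est_kvcache_size(n_embd, n_layer, 2048) >> 20); // 1024 MiB
    std::printf("n_ctx=4096: %zu MiB\n", est_kvcache_size(n_embd, n_layer, 4096) >> 20); // 2048 MiB
    return 0;
}

Doubling n_ctx doubles the cache estimate, which is why the hardcoded 2048 mentioned in the TODO further down leads to underestimation whenever a larger window is requested.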
4 changes: 2 additions & 2 deletions gpt4all-backend/llamamodel_impl.h
Expand Up @@ -17,9 +17,9 @@ class LLamaModel : public LLModel {

bool supportsEmbedding() const override { return false; }
bool supportsCompletion() const override { return true; }
bool loadModel(const std::string &modelPath) override;
bool loadModel(const std::string &modelPath, int n_ctx) override;
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath) override;
size_t requiredMem(const std::string &modelPath, int n_ctx) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
10 changes: 8 additions & 2 deletions gpt4all-backend/llmodel.cpp
@@ -138,7 +138,7 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
return nullptr;
}

LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant) {
LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant, int n_ctx) {
if (!has_at_least_minimal_hardware()) {
std::cerr << "LLModel ERROR: CPU does not support AVX\n";
return nullptr;
@@ -154,7 +154,11 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
if(impl) {
LLModel* metalimpl = impl->m_construct();
metalimpl->m_implementation = impl;
size_t req_mem = metalimpl->requiredMem(modelPath);
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
Comment on lines +157 to +160

Member Author:

@apage43 Do you think it would be relatively easy to switch this to a load-time check instead of a construct-time one? It doesn't matter so much right now since it's not working anyway (unresolved fallout from the switch to GGUF).

Member:

The reason it's construct-time is so that we do the fallback to CPU transparently: callers of construct passing "auto" just get the CPU implementation if the mem req is too high for Metal.

If it's changed to fail at load time, callers will have to handle that fallback themselves - which is likely fine, but would need to be done in all the bindings.

Member Author:

The chat UI is already doing load-time fallback for Vulkan. And this is really the only way to do it, because it's the user code that decides which GPU to use, which is of course initialized after a backend/implementation is available. We should make sure the bindings are capable of this too.

I think it would make sense to only ever dlopen one build of llamamodel-mainline on Apple silicon, as there's nothing we are currently doing that the Metal build isn't capable of.

size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx);
float req_to_total = (float) req_mem / (float) total_mem;
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
if (req_to_total >= 0.53) {
@@ -165,6 +169,8 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
}
}
}
#else
(void)n_ctx;
#endif

if (!impl) {
6 changes: 3 additions & 3 deletions gpt4all-backend/llmodel.h
@@ -37,7 +37,7 @@ class LLModel {
static bool isImplementation(const Dlhandle&);
static const std::vector<Implementation>& implementationList();
static const Implementation *implementation(const char *fname, const std::string& buildVariant);
static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto");
static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
static std::vector<GPUDevice> availableGPUDevices();
static void setImplementationsSearchPath(const std::string& path);
static const std::string& implementationsSearchPath();
@@ -74,9 +74,9 @@ class LLModel {

virtual bool supportsEmbedding() const = 0;
virtual bool supportsCompletion() const = 0;
virtual bool loadModel(const std::string &modelPath) = 0;
virtual bool loadModel(const std::string &modelPath, int n_ctx) = 0;
virtual bool isModelLoaded() const = 0;
virtual size_t requiredMem(const std::string &modelPath) = 0;
virtual size_t requiredMem(const std::string &modelPath, int n_ctx) = 0;
virtual size_t stateSize() const { return 0; }
virtual size_t saveState(uint8_t */*dest*/) const { return 0; }
virtual size_t restoreState(const uint8_t */*src*/) { return 0; }
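A minimal caller-side sketch of the updated C++ interface, assuming only the declarations shown in this header; the helper name and its default n_ctx value are illustrative, and error reporting is elided:

#include "llmodel.h"

#include <string>

// Construct an implementation and load the model with a caller-chosen context
// window. construct() receives n_ctx for its construct-time Metal memory check,
// and loadModel() receives it again to size the inference context.
LLModel *load_with_context(const std::string &modelPath, int n_ctx = 4096) {
    LLModel *model = LLModel::Implementation::construct(modelPath, "auto", n_ctx);
    if (!model)
        return nullptr;
    if (!model->loadModel(modelPath, n_ctx)) {
        delete model;
        return nullptr;
    }
    return model;
}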
8 changes: 4 additions & 4 deletions gpt4all-backend/llmodel_c.cpp
@@ -47,16 +47,16 @@ void llmodel_model_destroy(llmodel_model model) {
delete reinterpret_cast<LLModelWrapper*>(model);
}

size_t llmodel_required_mem(llmodel_model model, const char *model_path)
size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->requiredMem(model_path);
return wrapper->llModel->requiredMem(model_path, n_ctx);
}

bool llmodel_loadModel(llmodel_model model, const char *model_path)
bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->loadModel(model_path);
return wrapper->llModel->loadModel(model_path, n_ctx);
}

bool llmodel_isModelLoaded(llmodel_model model)
6 changes: 4 additions & 2 deletions gpt4all-backend/llmodel_c.h
@@ -110,17 +110,19 @@ void llmodel_model_destroy(llmodel_model model);
* Estimate RAM requirement for a model file
* @param model A pointer to the llmodel_model instance.
* @param model_path A string representing the path to the model file.
* @param n_ctx Maximum size of context window
* @return size greater than 0 if the model was parsed successfully, 0 if file could not be parsed.
*/
size_t llmodel_required_mem(llmodel_model model, const char *model_path);
size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx);

/**
* Load a model from a file.
* @param model A pointer to the llmodel_model instance.
* @param model_path A string representing the path to the model file.
* @param n_ctx Maximum size of context window
* @return true if the model was loaded successfully, false otherwise.
*/
bool llmodel_loadModel(llmodel_model model, const char *model_path);
bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx);

/**
* Check if a model is loaded.
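The same flow through the C API, as a minimal sketch that uses only the functions visible in this diff; the llmodel_model handle is assumed to have been created elsewhere (for example via llmodel_model_create2, as the bindings below do), and the example is written as C++ for consistency with the sketch above:

#include "llmodel_c.h"

#include <cstdio>

// Report the RAM estimate for the requested context window, then load with it.
bool try_load(llmodel_model model, const char *model_path, int n_ctx) {
    size_t req_mem = llmodel_required_mem(model, model_path, n_ctx);
    std::fprintf(stderr, "estimated RAM for n_ctx=%d: %zu bytes\n", n_ctx, req_mem);
    if (!llmodel_loadModel(model, model_path, n_ctx))
        return false;
    return llmodel_isModelLoaded(model);
}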
2 changes: 1 addition & 1 deletion gpt4all-bindings/csharp/Gpt4All/Bindings/LLModel.cs
@@ -188,7 +188,7 @@ public bool IsLoaded()
/// <returns>true if the model was loaded successfully, false otherwise.</returns>
public bool Load(string modelPath)
{
return NativeMethods.llmodel_loadModel(_handle, modelPath);
return NativeMethods.llmodel_loadModel(_handle, modelPath, 2048);
}

protected void Destroy()
3 changes: 2 additions & 1 deletion gpt4all-bindings/csharp/Gpt4All/Bindings/NativeMethods.cs
@@ -70,7 +70,8 @@ internal static unsafe partial class NativeMethods
[return: MarshalAs(UnmanagedType.I1)]
public static extern bool llmodel_loadModel(
[NativeTypeName("llmodel_model")] IntPtr model,
[NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string model_path);
[NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string model_path,
[NativeTypeName("int32_t")] int n_ctx);

[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]

@@ -39,7 +39,7 @@ private IGpt4AllModel CreateModel(string modelPath)
var handle = NativeMethods.llmodel_model_create2(modelPath, "auto", out error);
_logger.LogDebug("Model created handle=0x{ModelHandle:X8}", handle);
_logger.LogInformation("Model loading started");
var loadedSuccessfully = NativeMethods.llmodel_loadModel(handle, modelPath);
var loadedSuccessfully = NativeMethods.llmodel_loadModel(handle, modelPath, 2048);
_logger.LogInformation("Model loading completed success={ModelLoadSuccess}", loadedSuccessfully);
if (!loadedSuccessfully)
{
2 changes: 1 addition & 1 deletion gpt4all-bindings/golang/binding.cpp
@@ -23,7 +23,7 @@ void* load_model(const char *fname, int n_threads) {
fprintf(stderr, "%s: error '%s'\n", __func__, new_error);
return nullptr;
}
if (!llmodel_loadModel(model, fname)) {
if (!llmodel_loadModel(model, fname, 2048)) {
llmodel_model_destroy(model);
return nullptr;
}
@@ -195,7 +195,7 @@ public LLModel(Path modelPath) {
if(model == null) {
throw new IllegalStateException("Could not load, gpt4all backend returned error: " + error.getValue().getString(0));
}
library.llmodel_loadModel(model, modelPathAbs);
library.llmodel_loadModel(model, modelPathAbs, 2048);

if(!library.llmodel_isModelLoaded(model)){
throw new IllegalStateException("The model " + modelName + " could not be loaded");
@@ -61,7 +61,7 @@ public LLModelPromptContext(jnr.ffi.Runtime runtime) {

Pointer llmodel_model_create2(String model_path, String build_variant, PointerByReference error);
void llmodel_model_destroy(Pointer model);
boolean llmodel_loadModel(Pointer model, String model_path);
boolean llmodel_loadModel(Pointer model, String model_path, int n_ctx);
boolean llmodel_isModelLoaded(Pointer model);
@u_int64_t long llmodel_get_state_size(Pointer model);
@u_int64_t long llmodel_save_state_data(Pointer model, Pointer dest);
4 changes: 2 additions & 2 deletions gpt4all-bindings/python/gpt4all/__init__.py
@@ -1,2 +1,2 @@
from .gpt4all import Embed4All, GPT4All # noqa
from .pyllmodel import LLModel # noqa
from .gpt4all import Embed4All as Embed4All, GPT4All as GPT4All
from .pyllmodel import LLModel as LLModel
10 changes: 6 additions & 4 deletions gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -69,6 +69,7 @@ def __init__(
allow_download: bool = True,
n_threads: Optional[int] = None,
device: Optional[str] = "cpu",
n_ctx: int = 2048,
verbose: bool = False,
):
"""
@@ -90,15 +91,16 @@ def __init__(
Default is "cpu".

Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
n_ctx: Maximum size of context window
verbose: If True, print debug messages.
"""
self.model_type = model_type
self.model = pyllmodel.LLModel()
# Retrieve model and download if allowed
self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
if device is not None:
if device != "cpu":
self.model.init_gpu(model_path=self.config["path"], device=device)
self.model.load_model(self.config["path"])
if device is not None and device != "cpu":
self.model.init_gpu(model_path=self.config["path"], device=device, n_ctx=n_ctx)
self.model.load_model(self.config["path"], n_ctx)
# Set n_threads
if n_threads is not None:
self.model.set_thread_count(n_threads)