mudler · mudler · May 22, 2026 · May 22, 2026
diff --git a/core/backend/options.go b/core/backend/options.go
@@ -277,7 +277,7 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
 		MinP:                float32(*c.MinP),
 		Tokens:              int32(*c.Maxtokens),
 		Threads:             int32(*c.Threads),
-		PromptCacheAll:      c.PromptCacheAll,
+		PromptCacheAll:      *c.PromptCacheAll,
 		PromptCacheRO:       c.PromptCacheRO,
 		PromptCachePath:     promptCachePath,
 		F16KV:               *c.F16,

diff --git a/core/config/hooks_test.go b/core/config/hooks_test.go
@@ -136,4 +136,36 @@ var _ = Describe("Backend hooks and parser defaults", func() {
 			Expect(cfg.EngineArgs["enable_chunked_prefill"]).To(Equal(true))
 		})
 	})
+
+	Context("PromptCacheAll default", func() { // SetDefaults fills a nil PromptCacheAll and keeps explicit values
+		It("defaults to true when omitted from YAML", func() {
+			cfg := &ModelConfig{} // zero value: PromptCacheAll is a nil *bool, standing in for a config without the key
+			cfg.SetDefaults()
+
+			Expect(cfg.PromptCacheAll).NotTo(BeNil()) // guard the dereference on the next line
+			Expect(*cfg.PromptCacheAll).To(BeTrue())
+		})
+
+		It("preserves an explicit false from YAML", func() {
+			falseV := false // addressable local: the field is a *bool, so it needs a pointer
+			cfg := &ModelConfig{
+				LLMConfig: LLMConfig{PromptCacheAll: &falseV},
+			}
+			cfg.SetDefaults() // must leave the already non-nil pointer untouched
+
+			Expect(cfg.PromptCacheAll).NotTo(BeNil())
+			Expect(*cfg.PromptCacheAll).To(BeFalse())
+		})
+
+		It("preserves an explicit true from YAML", func() {
+			trueV := true // explicit value that matches the expected default
+			cfg := &ModelConfig{
+				LLMConfig: LLMConfig{PromptCacheAll: &trueV},
+			}
+			cfg.SetDefaults()
+
+			Expect(cfg.PromptCacheAll).NotTo(BeNil())
+			Expect(*cfg.PromptCacheAll).To(BeTrue())
+		})
+	})
 })
diff --git a/core/config/model_config.go b/core/config/model_config.go
@@ -209,7 +209,7 @@ type LLMConfig struct {
 	RMSNormEps      float32  `yaml:"rms_norm_eps,omitempty" json:"rms_norm_eps,omitempty"`
 	NGQA            int32    `yaml:"ngqa,omitempty" json:"ngqa,omitempty"`
 	PromptCachePath string   `yaml:"prompt_cache_path,omitempty" json:"prompt_cache_path,omitempty"`
-	PromptCacheAll  bool     `yaml:"prompt_cache_all,omitempty" json:"prompt_cache_all,omitempty"`
+	PromptCacheAll  *bool    `yaml:"prompt_cache_all,omitempty" json:"prompt_cache_all,omitempty"`
 	PromptCacheRO   bool     `yaml:"prompt_cache_ro,omitempty" json:"prompt_cache_ro,omitempty"`
 	MirostatETA     *float64 `yaml:"mirostat_eta,omitempty" json:"mirostat_eta,omitempty"`
 	MirostatTAU     *float64 `yaml:"mirostat_tau,omitempty" json:"mirostat_tau,omitempty"`
@@ -494,6 +494,13 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Reranking = &falseV
 	}
 
+	if cfg.PromptCacheAll == nil {
+		// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
+		// and let cache_idle_slots / kv_unified actually do useful work; users can
+		// opt out with an explicit `prompt_cache_all: false` in the model YAML.
+		cfg.PromptCacheAll = &trueV
+	}
+
 	if threads == 0 {
 		// Threads can't be 0
 		threads = 4