From 07a2e38bd18c8a1b2529a61813e3f0e18b1866a2 Mon Sep 17 00:00:00 2001
From: KPRoche
Date: Tue, 20 May 2025 15:01:41 -0700
Subject: [PATCH 1/2] added KV Cache

---
 docs/architecture/Components/06_kvcache.md | 177 +++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 docs/architecture/Components/06_kvcache.md

diff --git a/docs/architecture/Components/06_kvcache.md b/docs/architecture/Components/06_kvcache.md
new file mode 100644
index 0000000..7bdc559
--- /dev/null
+++ b/docs/architecture/Components/06_kvcache.md
@@ -0,0 +1,177 @@
---
sidebar_position: 06
sidebar_label: KVCache Manager
---
# KVCache Manager

## Introduction

LLM inference can be computationally expensive due to the sequential nature of token generation.
KV-caching plays a critical role in optimizing this process. By storing previously computed key and value attention vectors,
KVCache reuse avoids redundant computations during inference, significantly reducing latency and resource consumption.
This is particularly beneficial for long-context, multi-turn conversations or agentic (and RAG) applications where
previously computed information can be leveraged effectively.
Efficient KVCache management and routing are essential for scaling LLM inference and delivering a responsive user experience.

llm-d-kv-cache-manager is a pluggable KVCache Manager for KVCache-aware routing in vLLM-based serving platforms.

See the [docs folder in the repository](https://github.com/llm-d/llm-d-kv-cache-manager/blob/main/docs/README.md) for more information on goals, architecture and more.

## Overview

The code defines a [KVCacheIndexer](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/pkg/kv-cache/indexer.go) module that efficiently maintains a global view of KVCache states and localities.
In the current state of vLLM, the only available information on KVCache availability is for the tensors offloaded to KVCache Engines via the Connector API.

The `kvcache.Indexer` module is a pluggable Go package designed for use by orchestrators to enable KVCache-aware scheduling decisions.

```mermaid
graph
    subgraph Cluster
        Router
        subgraph KVCacheManager[KVCache Manager]
            KVCacheIndexer[KVCache Indexer]
            PrefixStore[LRU Prefix Store]
            KVBlockToPodIndex[KVBlock to Pod availability Index]
        end
        subgraph vLLMNode[vLLM Node]
            vLLMCore[vLLM Core]
            KVCacheEngine["KVCache Engine (LMCache)"]
        end
        Redis
    end

    Router -->|"Score(prompt, ModelName, relevantPods)"| KVCacheIndexer
    KVCacheIndexer -->|"{Pod to Scores map}"| Router
    Router -->|Route| vLLMNode

    KVCacheIndexer -->|"FindLongestTokenizedPrefix(prompt, ModelName) -> tokens"| PrefixStore
    PrefixStore -->|"DigestPromptAsync"| PrefixStore
    KVCacheIndexer -->|"GetPodsForKeys(tokens) -> {KVBlock keys to Pods} availability map"| KVBlockToPodIndex
    KVBlockToPodIndex -->|"Redis MGet(blockKeys) -> {KVBlock keys to Pods}"| Redis

    vLLMCore -->|Connector API| KVCacheEngine
    KVCacheEngine -->|"UpdateIndex(KVBlock keys, nodeIP)"| Redis
```
This overview greatly simplifies the actual architecture and combines steps across several submodules.

## Architecture

For an even more detailed architecture, refer to the [architecture](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/docs/architecture.md) document.

The architecture is designed to efficiently maintain a global view of KVCache states and localities, enabling KVCache-aware scheduling decisions.
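
To make the KVBlock lookup concrete, the sketch below shows one way the chunk-and-lookup step could be expressed in Go: tokens are grouped into fixed-size blocks, each block receives a key chained to the key before it, and a single Redis `MGet` resolves those keys to the pods advertising the blocks. The block size, hashing scheme, and Redis value layout are illustrative assumptions for this sketch, not the module's actual format.

```go
package main

import (
	"context"
	"crypto/sha256"
	"encoding/binary"
	"encoding/hex"
	"fmt"

	"github.com/redis/go-redis/v9"
)

// blockKeys derives one chained key per fixed-size token block: each key hashes
// the previous key together with the tokens of the current block, so a key only
// matches when every preceding block matches as well (prefix semantics).
// Partial trailing blocks are ignored in this sketch.
func blockKeys(tokens []uint32, blockSize int) []string {
	keys := make([]string, 0, len(tokens)/blockSize)
	prev := ""
	for start := 0; start+blockSize <= len(tokens); start += blockSize {
		h := sha256.New()
		h.Write([]byte(prev))
		for _, t := range tokens[start : start+blockSize] {
			var buf [4]byte
			binary.LittleEndian.PutUint32(buf[:], t)
			h.Write(buf[:])
		}
		prev = hex.EncodeToString(h.Sum(nil))
		keys = append(keys, prev)
	}
	return keys
}

func main() {
	rdb := redis.NewClient(&redis.Options{Addr: "localhost:6379"})

	// Hypothetical token IDs for a prompt; in the real flow these come from the
	// prefix store and tokenizers described in the detailed flow below.
	keys := blockKeys([]uint32{101, 2023, 2003, 1037, 7953, 3793, 2005, 102}, 4)

	// One MGet resolves every block key to the pods that report holding that block.
	vals, err := rdb.MGet(context.Background(), keys...).Result()
	if err != nil {
		panic(err)
	}
	for i, v := range vals {
		fmt.Printf("block %d (%s...): pods=%v\n", i, keys[i][:8], v)
	}
}
```

Chaining each key to its predecessor is what gives the index its prefix semantics: a key for block *n* is only meaningful for a request whose first *n-1* blocks are identical as well.
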

### Detailed System Flow

```mermaid
sequenceDiagram
    participant U as User
    participant KVI as KVCacheIndexer
    box
        participant KVBS as KVBlockScorer
        participant TPR as TokenProcessor
        participant KVBI as KVBlockIndexer
        participant Redis as Redis
    end
    box
        participant PS as PrefixStore
        participant LRUS as LRUStore
        participant TS as TrieStore
    end
    box
        participant TPO as TokenizersPool
        participant W as Worker
        participant CHT as HuggingFaceTokenizer
        participant CH as TokenizersCache
    end

# KVCacheIndexer
U->>KVI: 1. Score(prompt, ModelName, relevantPods)

# get available tokens of longest prefix
KVI->>PS: 2. FindLongestTokenizedPrefix(prompt, ModelName)
    alt LRU
        PS->>LRUS: 2.1 BuildLongestPrefix(prompt, ModelName)
    else Trie
        PS->>TS: 2.1 BuildLongestPrefix(prompt, ModelName)
    end
PS->>KVI: 2.2 Tokens of longest prefix

# get block keys
KVI->>TPR: 3. GetBlockKeys(tokens, ModelName)
    TPR->>KVI: 3.1 BlockKeys

# query kvblock indexer for pods
KVI->>KVBI: 4. GetPodsForKeys(blockKeys, relevantPods)
KVBI->>Redis: 4.1 MGet(blockKeys)
Redis->>KVBI: 4.2 key -> Pods mapping (KVCache availability)
KVBI->>KVBI: 4.3 FilterPods(relevantPods)

# score pods
KVI->>KVBS: 5. ScorePods(key->Pods) based on strategy

# results
KVI->>U: 6. Pod -> Score mapping

# add to tokenizers pool
KVI->>TPO: 2. AddTask(prompt, ModelName) // Registers task only
Note over TPO: Task added to queue
W-->>TPO: 2.1 Get(Task) // Async worker fetches task
W->>CHT: 2.3 Tokenize(prompt, ModelName)
CHT->>CH: 2.4 GetCachedTokenizerForModel()
CHT->>W: 2.5 Tokens
W->>PS: 2.6 AddTokens(prompt, ModelName, tokens)
alt LRU
    PS->>LRUS: 2.7 AddTokens(prompt, ModelName, tokens)
else Trie
    PS->>TS: 2.7 AddTokens(prompt, ModelName, tokens)
end
```

### Explanation
The main blocking sequence of steps that happens when a user (e.g., router) sends a request to the KVCacheIndexer is as follows:
1. **User** sends a request to the **KVCacheIndexer** with a prompt, model name, and relevant pods.
2. **KVCacheIndexer**:
    - Finds the longest tokenized prefix for the prompt and model name using the **PrefixStore**.
    - Depending on the store type (LRU or Trie), it gets the tokenization of the longest cached prefix.
    - Adds a tokenization task to the **TokenizersPool**, which is handled asynchronously by a worker. This bit is explained later.
3. **KVCacheIndexer** queries the **TokenProcessor** to get block keys for the tokens of the longest prefix.
4. **TokenProcessor**:
    - Chunks the tokens and generates keys for the token blocks. The chunking and key calculation have to be aligned with
      the source that feeds the key -> pods backend (Redis).
    - Returns the block keys to the **KVCacheIndexer**.
5. **KVCacheIndexer** queries the **KVBlockIndexer** for pods that have the block keys.
    - The **KVBlockIndexer** queries the **Redis** backend for the mappings with MGet.
    - The **Redis** backend efficiently returns the key -> pods mapping.
6. **KVCacheIndexer** uses the configured **KVBlockScorer** to score the pods based on block hits:
    - LongestPrefixMatch: scores by the longest consecutive (ordered) block hits in a single pod.
    - HighestBlockHit: scores by the index of the highest block hit in a single pod.
    - CoverageBasedMatching: scores by the total number of block hits in a single pod.

Asynchronous tokenization flow:
1. A worker fetches the task from the **TokenizersPool**.
2. The worker tokenizes the prompt using the **HuggingFaceTokenizer**.
3.
The **HuggingFaceTokenizer** retrieves the cached in-memory tokenizer for the model.
    - If the tokenizer is not cached, it gets created and cached.
4. The **HuggingFaceTokenizer** returns the tokens to the worker.
5. The worker adds the tokens to the **PrefixStore**.
    - Depending on the store type (LRU or Trie), it adds the tokens to the appropriate store:
      - LRUStore: an LRU HashTable of prompt-chunks to tokens
      - TrieStore: a Trie of characters to tokens
    - Due to the nature of how tokenizers operate, the tokenization of a prefix of a prompt is a prefix of the tokenization of the full prompt.
      One challenge in tokenization is that different chunks of a prompt map to different tokens.
      Therefore, when we chunk a prompt, we use the [_, end] index associated with the tokens to determine which tokens fall within a chunk.
      The implication of this design is that the tokens contained in a chunk are only correct if all previous chunks are also considered,
      since one token may be associated with the edge-characters of two consecutive chunks.

### Maintenance of Redis for KVBlock -> Pods Mapping
In the current phase, LMCache is set up to use the same Redis server for indexing. For the scope of LMCache, this indexing
is necessary for KVCache reuse through offloading and sharing.

## Examples

- [KVCache Indexer](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/examples/kv-cache-index/):
  - A reference implementation of using the `kvcache.Indexer` module.
- [KVCache Aware Scorer](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/examples/kv-cache-aware-scorer/):
  - A reference implementation of integrating the `kvcache.Indexer` module in
    [llm-d-inference-scheduler](https://github.com/llm-d/llm-d-inference-scheduler) in a KVCache-aware scorer.

From cc9426a0069e7d37be461f2b3183397d06153c74 Mon Sep 17 00:00:00 2001
From: KPRoche
Date: Wed, 21 May 2025 11:08:12 -0700
Subject: [PATCH 2/2] adding KV-cache page and linker invite link

---
 .../{06_kvcache.md => 06_kv-cache.md}         | 98 ++++++++++++-------
 docusaurus.config.js                          | 13 +++
 src/css/custom.css                            |  3 +-
 3 files changed, 80 insertions(+), 34 deletions(-)
 rename docs/architecture/Components/{06_kvcache.md => 06_kv-cache.md} (56%)

diff --git a/docs/architecture/Components/06_kvcache.md b/docs/architecture/Components/06_kv-cache.md
similarity index 56%
rename from docs/architecture/Components/06_kvcache.md
rename to docs/architecture/Components/06_kv-cache.md
index 7bdc559..4de745c 100644
--- a/docs/architecture/Components/06_kvcache.md
+++ b/docs/architecture/Components/06_kv-cache.md
@@ -1,51 +1,79 @@
 ---
 sidebar_position: 06
-sidebar_label: KVCache Manager
+sidebar_label: KV-Cache Manager
 ---
-# KVCache Manager
+# KV-Cache Manager

## Introduction

LLM inference can be computationally expensive due to the sequential nature of token generation.
KV-caching plays a critical role in optimizing this process. By storing previously computed key and value attention vectors,
KV-cache reuse avoids redundant computations during inference, significantly reducing latency and resource consumption.
This is particularly beneficial for long-context, multi-turn conversations or agentic (and RAG) applications where
previously computed information can be leveraged effectively.
Efficient KV-cache management and routing are essential for scaling LLM inference and delivering a responsive user experience.
+Efficient KV-cache management and routing are essential for scaling LLM inference and delivering a responsive user experience. -llmd-kv-cache-manager is a pluggable KVCache Manager for KVCache Aware Routing in vLLM-based serving platforms. +llmd-kv-cache-manager is a pluggable KV-cache Manager for KV-cache Aware Routing in LLM serving platforms. -See the [docs folder in the repository](https://github.com/llm-d/llm-d-kv-cache-manager/blob/main/docs/README.md) for more information on goals, architecture and more. -## Overview +This initial work will expand in capacity as development continues. + + See the [docs folder in the repository](https://github.com/llm-d/llm-d-kv-cache-manager/blob/main/docs/README.md) for more information on goals, architecture and more. -The code defines a [KVCacheIndexer](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/pkg/kv-cache/indexer.go) module that efficiently maintains a global view of KVCache states and localities. -In the current state of vLLM, the only available information on KVCache availability is that of the offloaded tensors to KVCache Engines via the Connector API. +## Goals -The `kvcache.Indexer` module is a pluggable Go package designed for use by orchestrators to enable KVCache-aware scheduling decisions. +The KV-Cache-Manager is designed to connect high-level serving-stack goals with concrete system capabilities through a layered objective structure: + +- **Improve user experience** + - By reducing Time-To-First-Token (TTFT) + - Enabled through higher KVCache hit rates and reduced tensor transfers + - Supported by smart routing and distributed cache availability + - Optimized by proactive pre-placement of hot caches and session duplication/migration +- **Reduce serving costs** + - By improving compute utilization + - Minimize re-compute via KVCache reuse and locality-aware request handling + - Leverage zero-copy cache transfers across nodes + +## Vision + +This goal structure above is shaped by our vision for emerging use cases like RAG and agentic workflows, +which involve heavy context-reuse across sessions and instances. +Shared documents, tool prompts, and workflow steps create overlapping token streams that benefit significantly from +cross-instance KVCache coordination. + +To implement this vision, the KVCache-Manager incorporates proactive cache placement, session duplication, +and cluster-level cache APIs - bridging gaps in current serving stacks where KVCache management and utilization is +not yet treated as a first-class concern. + +## Architecture Overview + +The code defines a [kvcache.Indexer](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/pkg/kv-cache/indexer.go) module that efficiently maintains a global view of KV-cache states and localities. +In the current state of vLLM, the only available information on KV-cache availability is that of the offloaded tensors to KV-cache Engines via the Connector API. + +The `kvcache.Indexer` module is a pluggable Go package designed for use by orchestrators to enable KV-cache-aware scheduling decisions. 
```mermaid graph subgraph Cluster Router - subgraph KVCacheManager[KVCache Manager] - KVCacheIndexer[KVCache Indexer] + subgraph KVCacheManager[KV-cache Manager] + kvcache.Indexer[KV-cache Indexer] PrefixStore[LRU Prefix Store] KVBlockToPodIndex[KVBlock to Pod availability Index] end subgraph vLLMNode[vLLM Node] vLLMCore[vLLM Core] - KVCacheEngine["KVCache Engine (LMCache)"] + KVCacheEngine["KV-cache Engine (LMCache)"] end Redis end - Router -->|"Score(prompt, ModelName, relevantPods)"| KVCacheIndexer - KVCacheIndexer -->|"{Pod to Scores map}"| Router + Router -->|"Score(prompt, ModelName, relevantPods)"| kvcache.Indexer + kvcache.Indexer -->|"{Pod to Scores map}"| Router Router -->|Route| vLLMNode - KVCacheIndexer -->|"FindLongestTokenizedPrefix(prompt, ModelName) -> tokens"| PrefixStore + kvcache.Indexer -->|"FindLongestTokenizedPrefix(prompt, ModelName) -> tokens"| PrefixStore PrefixStore -->|"DigestPromptAsync"| PrefixStore - KVCacheIndexer -->|"GetPodsForKeys(tokens) -> {KVBlock keys to Pods} availability map"| KVBlockToPodIndex + kvcache.Indexer -->|"GetPodsForKeys(tokens) -> {KVBlock keys to Pods} availability map"| KVBlockToPodIndex KVBlockToPodIndex -->|"Redis MGet(blockKeys) -> {KVBlock keys to Pods}"| Redis vLLMCore -->|Connector API| KVCacheEngine @@ -53,18 +81,20 @@ graph ``` This overview greatly simplifies the actual architecture and combines steps across several submodules. + + ## Architecture For even more a detailed architecture, refer to the [architecture](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/docs/architecture.md) document. -The architecture is designed to efficiently maintain a global view of KVCache states and localities, enabling KVCache-aware scheduling decisions. +The architecture is designed to efficiently maintain a global view of KV-cache states and localities, enabling KV-cache-aware scheduling decisions. ### Detailed System Flow ```mermaid sequenceDiagram participant U as User - participant KVI as KVCacheIndexer + participant KVI as kvcache.Indexer box participant KVBS as KVBlockScorer participant TPR as TokenProcessor @@ -83,7 +113,7 @@ sequenceDiagram participant CH as TokenizersCache end -# KVCacheIndexer +# kvcache.Indexer U->>KVI: 1. Score(prompt, ModelName, relevantPods) # get available tokens of longest prefix @@ -102,7 +132,7 @@ KVI->>TPR: 3 GetBlockKeys(tokens, ModelName) # query kvblock indexer for pods KVI->>KVBI: 4. GetPodsForKeys(blockKeys, relevantPods) KVBI->>Redis: 4.1 MGet(blockKeys) -Redis->>KVBI: 4.2 key -> Pods mapping (KVCache availability) +Redis->>KVBI: 4.2 key -> Pods mapping (KV-cache availability) KVBI->>KVBI: 4.3 FilterPods(relevantPods) # score pods @@ -127,21 +157,21 @@ end ``` ### Explanation -The main blocking sequence of steps that happens when a user (e.g., router) sends a request to the KVCacheIndexer is as follows: -1. **User** sends a request to the **KVCacheIndexer** with a prompt, model name, and relevant pods. -2. **KVCacheIndexer**: +The main blocking sequence of steps that happens when a user (e.g., router) sends a request to the kvcache.Indexer is as follows: +1. **User** sends a request to the **kvcache.Indexer** with a prompt, model name, and relevant pods. +2. **kvcache.Indexer**: - Finds the longest tokenized prefix for the prompt and model name using the **PrefixStore**. - Depending on the store type (LRU or Trie), it gets the tokenization of the longest cached prefix - Adds a tokenization task to the **TokenizersPool**, which is handled asynchronously by a worker. This bit is explained later. 
-3. **KVCacheIndexer** queries the **TokenProcessor** to get block keys for the tokens of the longest prefix. +3. **kvcache.Indexer** queries the **TokenProcessor** to get block keys for the tokens of the longest prefix. 4. **TokenProcessor**: - Chunks the tokens and generate keys for the token blocks. The chunking and key calculating has to be aligned with the source that feeds the key -> pods backend (Redis). - - Returns the block keys to the **KVCacheIndexer**. -5. **KVCacheIndexer** queries the **KVBlockIndexer** for pods that have the block keys. + - Returns the block keys to the **kvcache.Indexer**. +5. **kvcache.Indexer** queries the **KVBlockIndexer** for pods that have the block keys. - The **KVBlockIndexer** queries the **Redis** backend for the mappings with MGet. - The **Redis** backend efficiently returns the key -> pods mapping. -6. **KVCacheIndexer** uses the configured **KVBlockScorer** to score the pods based block hits: +6. **kvcache.Indexer** uses the configured **KVBlockScorer** to score the pods based block hits: - LongestPrefixMatch: scores by the longest consecutive (ordered) block hits in a single pod. - HighestBlockHit: scores by the index of the highest block hit in a single pod. - CoverageBasedMatching: scores by the total number of block hits in a single pod. @@ -163,15 +193,17 @@ Asynchronous tokenization flow: since one token may be associated with the edge-characters of two consecutive chunks. ### Maintenance of Redis for KVBlock -> Pods Mapping -In the current phase, LMCache is set up to use the same Redis server for indexing. For the scope of LMCache, this indexing -is necessary for KVCache reuse through offloading and sharing. + +Currently, indexing information is updated from vLLM for the offloaded tokens using the Connector API, specifically leveraging the LMCache connector. + +Future enhancements will enable the `llm-d-kv-cache-manager` component to process KV-cache events across all memory layers of vLLM, ensuring an accurate holistic view of KV-cache localities throughout the system. ## Examples -- [KVCache Indexer](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/examples/kv-cache-index/): +- [KV-cache Indexer](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/examples/kv-cache-index/): - A reference implementation of using the `kvcache.Indexer` module. -- [KVCache Aware Scorer](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/examples/kv-cache-aware-scorer/): +- [KV-cache Aware Scorer](https://github.com/llm-d/llm-d-kv-cache-manager/tree/main/examples/kv-cache-aware-scorer/): - A reference implementation of integrating the `kvcache.Indexer` module in - [llm-d-inference-scheduler](https://github.com/llm-d/llm-d-inference-scheduler) in a KVCache aware scorer. + [llm-d-inference-scheduler](https://github.com/llm-d/llm-d-inference-scheduler) in a KV-cache aware scorer. diff --git a/docusaurus.config.js b/docusaurus.config.js index d6190fe..aaa6be0 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -28,6 +28,7 @@ const config = { url: "https://llm-d.ai/", // Set the // pathname under which your site is served // For GitHub pages deployment, it is often '//' + //baseUrl: "/llm-d.github.io/", baseUrl: "/", // GitHub pages deployment config. 
@@ -214,6 +215,7 @@ const config = { `, }, + { html: ` @@ -223,6 +225,17 @@ const config = { `, }, + { + html: ` + + Join our Slack + Slack Inviter link + + `, + }, + { html: ` diff --git a/src/css/custom.css b/src/css/custom.css index 85746f8..ae82a45 100644 --- a/src/css/custom.css +++ b/src/css/custom.css @@ -65,7 +65,8 @@ display: none !important; .linkedin, .slack, .reddit, -.x { +.x, +.inviter { width: 28px; height: 28px; margin: 10px 0;